warp-lang 1.9.1__py3-none-manylinux_2_34_aarch64.whl → 1.10.0rc2__py3-none-manylinux_2_34_aarch64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of warp-lang might be problematic. Click here for more details.

Files changed (346)
  1. warp/__init__.py +301 -287
  2. warp/__init__.pyi +794 -305
  3. warp/_src/__init__.py +14 -0
  4. warp/_src/autograd.py +1075 -0
  5. warp/_src/build.py +618 -0
  6. warp/_src/build_dll.py +640 -0
  7. warp/{builtins.py → _src/builtins.py} +1382 -377
  8. warp/_src/codegen.py +4359 -0
  9. warp/{config.py → _src/config.py} +178 -169
  10. warp/_src/constants.py +57 -0
  11. warp/_src/context.py +8294 -0
  12. warp/_src/dlpack.py +462 -0
  13. warp/_src/fabric.py +355 -0
  14. warp/_src/fem/__init__.py +14 -0
  15. warp/_src/fem/adaptivity.py +508 -0
  16. warp/_src/fem/cache.py +687 -0
  17. warp/_src/fem/dirichlet.py +188 -0
  18. warp/{fem → _src/fem}/domain.py +40 -30
  19. warp/_src/fem/field/__init__.py +131 -0
  20. warp/_src/fem/field/field.py +701 -0
  21. warp/{fem → _src/fem}/field/nodal_field.py +30 -15
  22. warp/{fem → _src/fem}/field/restriction.py +1 -1
  23. warp/{fem → _src/fem}/field/virtual.py +53 -27
  24. warp/_src/fem/geometry/__init__.py +32 -0
  25. warp/{fem → _src/fem}/geometry/adaptive_nanogrid.py +77 -163
  26. warp/_src/fem/geometry/closest_point.py +97 -0
  27. warp/{fem → _src/fem}/geometry/deformed_geometry.py +14 -22
  28. warp/{fem → _src/fem}/geometry/element.py +32 -10
  29. warp/{fem → _src/fem}/geometry/geometry.py +48 -20
  30. warp/{fem → _src/fem}/geometry/grid_2d.py +12 -23
  31. warp/{fem → _src/fem}/geometry/grid_3d.py +12 -23
  32. warp/{fem → _src/fem}/geometry/hexmesh.py +40 -63
  33. warp/{fem → _src/fem}/geometry/nanogrid.py +255 -248
  34. warp/{fem → _src/fem}/geometry/partition.py +121 -63
  35. warp/{fem → _src/fem}/geometry/quadmesh.py +26 -45
  36. warp/{fem → _src/fem}/geometry/tetmesh.py +40 -63
  37. warp/{fem → _src/fem}/geometry/trimesh.py +26 -45
  38. warp/{fem → _src/fem}/integrate.py +164 -158
  39. warp/_src/fem/linalg.py +383 -0
  40. warp/_src/fem/operator.py +396 -0
  41. warp/_src/fem/polynomial.py +229 -0
  42. warp/{fem → _src/fem}/quadrature/pic_quadrature.py +15 -20
  43. warp/{fem → _src/fem}/quadrature/quadrature.py +95 -47
  44. warp/_src/fem/space/__init__.py +248 -0
  45. warp/{fem → _src/fem}/space/basis_function_space.py +20 -11
  46. warp/_src/fem/space/basis_space.py +679 -0
  47. warp/{fem → _src/fem}/space/dof_mapper.py +3 -3
  48. warp/{fem → _src/fem}/space/function_space.py +14 -13
  49. warp/{fem → _src/fem}/space/grid_2d_function_space.py +4 -7
  50. warp/{fem → _src/fem}/space/grid_3d_function_space.py +4 -4
  51. warp/{fem → _src/fem}/space/hexmesh_function_space.py +4 -10
  52. warp/{fem → _src/fem}/space/nanogrid_function_space.py +3 -9
  53. warp/{fem → _src/fem}/space/partition.py +117 -60
  54. warp/{fem → _src/fem}/space/quadmesh_function_space.py +4 -10
  55. warp/{fem → _src/fem}/space/restriction.py +66 -33
  56. warp/_src/fem/space/shape/__init__.py +152 -0
  57. warp/{fem → _src/fem}/space/shape/cube_shape_function.py +9 -9
  58. warp/{fem → _src/fem}/space/shape/shape_function.py +8 -9
  59. warp/{fem → _src/fem}/space/shape/square_shape_function.py +6 -6
  60. warp/{fem → _src/fem}/space/shape/tet_shape_function.py +3 -3
  61. warp/{fem → _src/fem}/space/shape/triangle_shape_function.py +3 -3
  62. warp/{fem → _src/fem}/space/tetmesh_function_space.py +3 -9
  63. warp/_src/fem/space/topology.py +459 -0
  64. warp/{fem → _src/fem}/space/trimesh_function_space.py +3 -9
  65. warp/_src/fem/types.py +112 -0
  66. warp/_src/fem/utils.py +486 -0
  67. warp/_src/jax.py +186 -0
  68. warp/_src/jax_experimental/__init__.py +14 -0
  69. warp/_src/jax_experimental/custom_call.py +387 -0
  70. warp/_src/jax_experimental/ffi.py +1284 -0
  71. warp/_src/jax_experimental/xla_ffi.py +656 -0
  72. warp/_src/marching_cubes.py +708 -0
  73. warp/_src/math.py +414 -0
  74. warp/_src/optim/__init__.py +14 -0
  75. warp/_src/optim/adam.py +163 -0
  76. warp/_src/optim/linear.py +1606 -0
  77. warp/_src/optim/sgd.py +112 -0
  78. warp/_src/paddle.py +406 -0
  79. warp/_src/render/__init__.py +14 -0
  80. warp/_src/render/imgui_manager.py +289 -0
  81. warp/_src/render/render_opengl.py +3636 -0
  82. warp/_src/render/render_usd.py +937 -0
  83. warp/_src/render/utils.py +160 -0
  84. warp/_src/sparse.py +2716 -0
  85. warp/_src/tape.py +1206 -0
  86. warp/{thirdparty → _src/thirdparty}/unittest_parallel.py +9 -2
  87. warp/_src/torch.py +391 -0
  88. warp/_src/types.py +5870 -0
  89. warp/_src/utils.py +1693 -0
  90. warp/autograd.py +12 -1054
  91. warp/bin/warp-clang.so +0 -0
  92. warp/bin/warp.so +0 -0
  93. warp/build.py +8 -588
  94. warp/build_dll.py +6 -721
  95. warp/codegen.py +6 -4251
  96. warp/constants.py +6 -39
  97. warp/context.py +12 -8062
  98. warp/dlpack.py +6 -444
  99. warp/examples/distributed/example_jacobi_mpi.py +4 -5
  100. warp/examples/fem/example_adaptive_grid.py +1 -1
  101. warp/examples/fem/example_apic_fluid.py +1 -1
  102. warp/examples/fem/example_burgers.py +8 -8
  103. warp/examples/fem/example_diffusion.py +1 -1
  104. warp/examples/fem/example_distortion_energy.py +1 -1
  105. warp/examples/fem/example_mixed_elasticity.py +2 -2
  106. warp/examples/fem/example_navier_stokes.py +1 -1
  107. warp/examples/fem/example_nonconforming_contact.py +7 -7
  108. warp/examples/fem/example_stokes.py +1 -1
  109. warp/examples/fem/example_stokes_transfer.py +1 -1
  110. warp/examples/fem/utils.py +2 -2
  111. warp/examples/interop/example_jax_callable.py +1 -1
  112. warp/examples/interop/example_jax_ffi_callback.py +1 -1
  113. warp/examples/interop/example_jax_kernel.py +1 -1
  114. warp/examples/tile/example_tile_mcgp.py +191 -0
  115. warp/fabric.py +6 -337
  116. warp/fem/__init__.py +159 -97
  117. warp/fem/adaptivity.py +7 -489
  118. warp/fem/cache.py +9 -648
  119. warp/fem/dirichlet.py +6 -184
  120. warp/fem/field/__init__.py +8 -109
  121. warp/fem/field/field.py +7 -652
  122. warp/fem/geometry/__init__.py +7 -18
  123. warp/fem/geometry/closest_point.py +11 -77
  124. warp/fem/linalg.py +18 -366
  125. warp/fem/operator.py +11 -369
  126. warp/fem/polynomial.py +9 -209
  127. warp/fem/space/__init__.py +5 -211
  128. warp/fem/space/basis_space.py +6 -662
  129. warp/fem/space/shape/__init__.py +41 -118
  130. warp/fem/space/topology.py +6 -437
  131. warp/fem/types.py +6 -81
  132. warp/fem/utils.py +11 -444
  133. warp/jax.py +8 -165
  134. warp/jax_experimental/__init__.py +14 -1
  135. warp/jax_experimental/custom_call.py +8 -365
  136. warp/jax_experimental/ffi.py +17 -873
  137. warp/jax_experimental/xla_ffi.py +5 -605
  138. warp/marching_cubes.py +5 -689
  139. warp/math.py +16 -393
  140. warp/native/array.h +385 -37
  141. warp/native/builtin.h +314 -37
  142. warp/native/bvh.cpp +43 -9
  143. warp/native/bvh.cu +62 -27
  144. warp/native/bvh.h +310 -309
  145. warp/native/clang/clang.cpp +102 -97
  146. warp/native/coloring.cpp +0 -1
  147. warp/native/crt.h +208 -0
  148. warp/native/exports.h +156 -0
  149. warp/native/hashgrid.cu +2 -0
  150. warp/native/intersect.h +24 -1
  151. warp/native/intersect_tri.h +44 -35
  152. warp/native/mat.h +1456 -276
  153. warp/native/mesh.cpp +4 -4
  154. warp/native/mesh.cu +4 -2
  155. warp/native/mesh.h +176 -61
  156. warp/native/quat.h +0 -52
  157. warp/native/scan.cu +2 -0
  158. warp/native/sparse.cu +7 -3
  159. warp/native/spatial.h +12 -0
  160. warp/native/tile.h +681 -89
  161. warp/native/tile_radix_sort.h +1 -1
  162. warp/native/tile_reduce.h +394 -46
  163. warp/native/tile_scan.h +4 -4
  164. warp/native/vec.h +469 -0
  165. warp/native/version.h +23 -0
  166. warp/native/volume.cpp +1 -1
  167. warp/native/volume.cu +1 -0
  168. warp/native/volume.h +1 -1
  169. warp/native/volume_builder.cu +2 -0
  170. warp/native/warp.cpp +57 -29
  171. warp/native/warp.cu +253 -171
  172. warp/native/warp.h +11 -8
  173. warp/optim/__init__.py +6 -3
  174. warp/optim/adam.py +6 -145
  175. warp/optim/linear.py +14 -1585
  176. warp/optim/sgd.py +6 -94
  177. warp/paddle.py +6 -388
  178. warp/render/__init__.py +8 -4
  179. warp/render/imgui_manager.py +7 -267
  180. warp/render/render_opengl.py +6 -3618
  181. warp/render/render_usd.py +6 -919
  182. warp/render/utils.py +6 -142
  183. warp/sparse.py +37 -2563
  184. warp/tape.py +6 -1188
  185. warp/tests/__main__.py +1 -1
  186. warp/tests/cuda/test_async.py +4 -4
  187. warp/tests/cuda/test_conditional_captures.py +1 -1
  188. warp/tests/cuda/test_multigpu.py +1 -1
  189. warp/tests/cuda/test_streams.py +58 -1
  190. warp/tests/geometry/test_bvh.py +157 -22
  191. warp/tests/geometry/test_marching_cubes.py +0 -1
  192. warp/tests/geometry/test_mesh.py +5 -3
  193. warp/tests/geometry/test_mesh_query_aabb.py +5 -12
  194. warp/tests/geometry/test_mesh_query_point.py +5 -2
  195. warp/tests/geometry/test_mesh_query_ray.py +15 -3
  196. warp/tests/geometry/test_volume_write.py +5 -5
  197. warp/tests/interop/test_dlpack.py +14 -14
  198. warp/tests/interop/test_jax.py +772 -49
  199. warp/tests/interop/test_paddle.py +1 -1
  200. warp/tests/test_adam.py +0 -1
  201. warp/tests/test_arithmetic.py +9 -9
  202. warp/tests/test_array.py +527 -100
  203. warp/tests/test_array_reduce.py +3 -3
  204. warp/tests/test_atomic.py +12 -8
  205. warp/tests/test_atomic_bitwise.py +209 -0
  206. warp/tests/test_atomic_cas.py +4 -4
  207. warp/tests/test_bool.py +2 -2
  208. warp/tests/test_builtins_resolution.py +5 -571
  209. warp/tests/test_codegen.py +33 -14
  210. warp/tests/test_conditional.py +1 -1
  211. warp/tests/test_context.py +6 -6
  212. warp/tests/test_copy.py +242 -161
  213. warp/tests/test_ctypes.py +3 -3
  214. warp/tests/test_devices.py +24 -2
  215. warp/tests/test_examples.py +16 -84
  216. warp/tests/test_fabricarray.py +35 -35
  217. warp/tests/test_fast_math.py +0 -2
  218. warp/tests/test_fem.py +56 -10
  219. warp/tests/test_fixedarray.py +3 -3
  220. warp/tests/test_func.py +8 -5
  221. warp/tests/test_generics.py +1 -1
  222. warp/tests/test_indexedarray.py +24 -24
  223. warp/tests/test_intersect.py +39 -9
  224. warp/tests/test_large.py +1 -1
  225. warp/tests/test_lerp.py +3 -1
  226. warp/tests/test_linear_solvers.py +1 -1
  227. warp/tests/test_map.py +35 -4
  228. warp/tests/test_mat.py +52 -62
  229. warp/tests/test_mat_constructors.py +4 -5
  230. warp/tests/test_mat_lite.py +1 -1
  231. warp/tests/test_mat_scalar_ops.py +121 -121
  232. warp/tests/test_math.py +34 -0
  233. warp/tests/test_module_aot.py +4 -4
  234. warp/tests/test_modules_lite.py +28 -2
  235. warp/tests/test_print.py +11 -11
  236. warp/tests/test_quat.py +93 -58
  237. warp/tests/test_runlength_encode.py +1 -1
  238. warp/tests/test_scalar_ops.py +38 -10
  239. warp/tests/test_smoothstep.py +1 -1
  240. warp/tests/test_sparse.py +126 -15
  241. warp/tests/test_spatial.py +105 -87
  242. warp/tests/test_special_values.py +6 -6
  243. warp/tests/test_static.py +7 -7
  244. warp/tests/test_struct.py +13 -2
  245. warp/tests/test_triangle_closest_point.py +48 -1
  246. warp/tests/test_types.py +27 -15
  247. warp/tests/test_utils.py +52 -52
  248. warp/tests/test_vec.py +29 -29
  249. warp/tests/test_vec_constructors.py +5 -5
  250. warp/tests/test_vec_scalar_ops.py +97 -97
  251. warp/tests/test_version.py +75 -0
  252. warp/tests/tile/test_tile.py +178 -0
  253. warp/tests/tile/test_tile_atomic_bitwise.py +403 -0
  254. warp/tests/tile/test_tile_cholesky.py +7 -4
  255. warp/tests/tile/test_tile_load.py +26 -2
  256. warp/tests/tile/test_tile_mathdx.py +3 -3
  257. warp/tests/tile/test_tile_matmul.py +1 -1
  258. warp/tests/tile/test_tile_mlp.py +2 -4
  259. warp/tests/tile/test_tile_reduce.py +214 -13
  260. warp/tests/unittest_suites.py +6 -14
  261. warp/tests/unittest_utils.py +10 -9
  262. warp/tests/walkthrough_debug.py +3 -1
  263. warp/torch.py +6 -373
  264. warp/types.py +29 -5764
  265. warp/utils.py +10 -1659
  266. {warp_lang-1.9.1.dist-info → warp_lang-1.10.0rc2.dist-info}/METADATA +46 -99
  267. warp_lang-1.10.0rc2.dist-info/RECORD +468 -0
  268. warp_lang-1.10.0rc2.dist-info/licenses/licenses/Gaia-LICENSE.txt +6 -0
  269. warp_lang-1.10.0rc2.dist-info/licenses/licenses/appdirs-LICENSE.txt +22 -0
  270. warp_lang-1.10.0rc2.dist-info/licenses/licenses/asset_pixel_jpg-LICENSE.txt +3 -0
  271. warp_lang-1.10.0rc2.dist-info/licenses/licenses/cuda-LICENSE.txt +1582 -0
  272. warp_lang-1.10.0rc2.dist-info/licenses/licenses/dlpack-LICENSE.txt +201 -0
  273. warp_lang-1.10.0rc2.dist-info/licenses/licenses/fp16-LICENSE.txt +28 -0
  274. warp_lang-1.10.0rc2.dist-info/licenses/licenses/libmathdx-LICENSE.txt +220 -0
  275. warp_lang-1.10.0rc2.dist-info/licenses/licenses/llvm-LICENSE.txt +279 -0
  276. warp_lang-1.10.0rc2.dist-info/licenses/licenses/moller-LICENSE.txt +16 -0
  277. warp_lang-1.10.0rc2.dist-info/licenses/licenses/nanovdb-LICENSE.txt +2 -0
  278. warp_lang-1.10.0rc2.dist-info/licenses/licenses/nvrtc-LICENSE.txt +1592 -0
  279. warp_lang-1.10.0rc2.dist-info/licenses/licenses/svd-LICENSE.txt +23 -0
  280. warp_lang-1.10.0rc2.dist-info/licenses/licenses/unittest_parallel-LICENSE.txt +21 -0
  281. warp_lang-1.10.0rc2.dist-info/licenses/licenses/usd-LICENSE.txt +213 -0
  282. warp_lang-1.10.0rc2.dist-info/licenses/licenses/windingnumber-LICENSE.txt +21 -0
  283. warp/examples/assets/cartpole.urdf +0 -110
  284. warp/examples/assets/crazyflie.usd +0 -0
  285. warp/examples/assets/nv_ant.xml +0 -92
  286. warp/examples/assets/nv_humanoid.xml +0 -183
  287. warp/examples/assets/quadruped.urdf +0 -268
  288. warp/examples/optim/example_bounce.py +0 -266
  289. warp/examples/optim/example_cloth_throw.py +0 -228
  290. warp/examples/optim/example_drone.py +0 -870
  291. warp/examples/optim/example_inverse_kinematics.py +0 -182
  292. warp/examples/optim/example_inverse_kinematics_torch.py +0 -191
  293. warp/examples/optim/example_softbody_properties.py +0 -400
  294. warp/examples/optim/example_spring_cage.py +0 -245
  295. warp/examples/optim/example_trajectory.py +0 -227
  296. warp/examples/sim/example_cartpole.py +0 -143
  297. warp/examples/sim/example_cloth.py +0 -225
  298. warp/examples/sim/example_cloth_self_contact.py +0 -316
  299. warp/examples/sim/example_granular.py +0 -130
  300. warp/examples/sim/example_granular_collision_sdf.py +0 -202
  301. warp/examples/sim/example_jacobian_ik.py +0 -244
  302. warp/examples/sim/example_particle_chain.py +0 -124
  303. warp/examples/sim/example_quadruped.py +0 -203
  304. warp/examples/sim/example_rigid_chain.py +0 -203
  305. warp/examples/sim/example_rigid_contact.py +0 -195
  306. warp/examples/sim/example_rigid_force.py +0 -133
  307. warp/examples/sim/example_rigid_gyroscopic.py +0 -115
  308. warp/examples/sim/example_rigid_soft_contact.py +0 -140
  309. warp/examples/sim/example_soft_body.py +0 -196
  310. warp/examples/tile/example_tile_walker.py +0 -327
  311. warp/sim/__init__.py +0 -74
  312. warp/sim/articulation.py +0 -793
  313. warp/sim/collide.py +0 -2570
  314. warp/sim/graph_coloring.py +0 -307
  315. warp/sim/import_mjcf.py +0 -791
  316. warp/sim/import_snu.py +0 -227
  317. warp/sim/import_urdf.py +0 -579
  318. warp/sim/import_usd.py +0 -898
  319. warp/sim/inertia.py +0 -357
  320. warp/sim/integrator.py +0 -245
  321. warp/sim/integrator_euler.py +0 -2000
  322. warp/sim/integrator_featherstone.py +0 -2101
  323. warp/sim/integrator_vbd.py +0 -2487
  324. warp/sim/integrator_xpbd.py +0 -3295
  325. warp/sim/model.py +0 -4821
  326. warp/sim/particles.py +0 -121
  327. warp/sim/render.py +0 -431
  328. warp/sim/utils.py +0 -431
  329. warp/tests/sim/disabled_kinematics.py +0 -244
  330. warp/tests/sim/test_cloth.py +0 -863
  331. warp/tests/sim/test_collision.py +0 -743
  332. warp/tests/sim/test_coloring.py +0 -347
  333. warp/tests/sim/test_inertia.py +0 -161
  334. warp/tests/sim/test_model.py +0 -226
  335. warp/tests/sim/test_sim_grad.py +0 -287
  336. warp/tests/sim/test_sim_grad_bounce_linear.py +0 -212
  337. warp/tests/sim/test_sim_kinematics.py +0 -98
  338. warp/thirdparty/__init__.py +0 -0
  339. warp_lang-1.9.1.dist-info/RECORD +0 -456
  340. /warp/{fem → _src/fem}/quadrature/__init__.py +0 -0
  341. /warp/{tests/sim → _src/thirdparty}/__init__.py +0 -0
  342. /warp/{thirdparty → _src/thirdparty}/appdirs.py +0 -0
  343. /warp/{thirdparty → _src/thirdparty}/dlpack.py +0 -0
  344. {warp_lang-1.9.1.dist-info → warp_lang-1.10.0rc2.dist-info}/WHEEL +0 -0
  345. {warp_lang-1.9.1.dist-info → warp_lang-1.10.0rc2.dist-info}/licenses/LICENSE.md +0 -0
  346. {warp_lang-1.9.1.dist-info → warp_lang-1.10.0rc2.dist-info}/top_level.txt +0 -0
warp/native/clang/clang.cpp CHANGED
@@ -16,6 +16,7 @@
16
16
  */
17
17
 
18
18
  #include "../native/crt.h"
19
+ #include "../version.h"
19
20
 
20
21
  #include <clang/Frontend/CompilerInstance.h>
21
22
  #include <clang/Basic/DiagnosticOptions.h>
@@ -58,27 +59,14 @@
58
59
  #if defined(_WIN64)
59
60
  extern "C" void __chkstk();
60
61
  #elif defined(__APPLE__)
61
-
62
- #if defined(__MACH__) && defined(__aarch64__)
63
62
  extern "C" void _bzero(void *s, size_t n) {
64
63
  memset(s, 0, n);
65
64
  }
66
65
  extern "C" void __bzero(void *s, size_t n) {
67
66
  memset(s, 0, n);
68
67
  }
69
-
70
- extern "C" void _memset_pattern16(void *s, const void *pattern, size_t n);
71
- extern "C" void __memset_pattern16(void *s, const void *pattern, size_t n);
72
-
73
- #else
74
- // // Intel Mac's define bzero in libSystem.dylib
75
- extern "C" void __bzero(void *s, size_t n);
76
-
77
68
  extern "C" void _memset_pattern16(void *s, const void *pattern, size_t n);
78
69
  extern "C" void __memset_pattern16(void *s, const void *pattern, size_t n);
79
-
80
- #endif
81
-
82
70
  extern "C" __double2 __sincos_stret(double);
83
71
  extern "C" __float2 __sincosf_stret(float);
84
72
  #endif // defined(__APPLE__)
@@ -114,7 +102,7 @@ static void initialize_llvm()
114
102
  llvm::InitializeAllAsmPrinters();
115
103
  }
116
104
 
117
- static std::unique_ptr<llvm::Module> cpp_to_llvm(const std::string& input_file, const char* cpp_src, const char* include_dir, bool debug, bool verify_fp, llvm::LLVMContext& context)
105
+ static std::unique_ptr<llvm::Module> source_to_llvm(bool is_cuda, const std::string& input_file, const char* cpp_src, const char* include_dir, bool debug, bool verify_fp, llvm::LLVMContext& context, bool tiles_in_stack_memory)
118
106
  {
119
107
  // Compilation arguments
120
108
  std::vector<const char*> args;
@@ -125,84 +113,50 @@ static std::unique_ptr<llvm::Module> cpp_to_llvm(const std::string& input_file,
125
113
 
126
114
  args.push_back(debug ? "-O0" : "-O2");
127
115
 
128
- args.push_back("-triple");
129
- args.push_back(target_triple);
130
-
131
- #if defined(__x86_64__) || defined(_M_X64)
132
- args.push_back("-target-feature");
133
- args.push_back("+f16c"); // Enables support for _Float16
134
- #endif
135
-
136
- clang::IntrusiveRefCntPtr<clang::DiagnosticOptions> diagnostic_options = new clang::DiagnosticOptions();
137
- std::unique_ptr<clang::TextDiagnosticPrinter> text_diagnostic_printer =
138
- std::make_unique<clang::TextDiagnosticPrinter>(llvm::errs(), &*diagnostic_options);
139
- clang::IntrusiveRefCntPtr<clang::DiagnosticIDs> diagnostic_ids;
140
- std::unique_ptr<clang::DiagnosticsEngine> diagnostic_engine =
141
- std::make_unique<clang::DiagnosticsEngine>(diagnostic_ids, &*diagnostic_options, text_diagnostic_printer.release());
142
-
143
- clang::CompilerInstance compiler_instance;
144
-
145
- auto& compiler_invocation = compiler_instance.getInvocation();
146
- clang::CompilerInvocation::CreateFromArgs(compiler_invocation, args, *diagnostic_engine.release());
147
-
148
- if(debug)
116
+ if(is_cuda)
149
117
  {
150
- #if LLVM_VERSION_MAJOR >= 18
151
- compiler_invocation.getCodeGenOpts().setDebugInfo(llvm::codegenoptions::FullDebugInfo);
152
- #else
153
- compiler_invocation.getCodeGenOpts().setDebugInfo(clang::codegenoptions::FullDebugInfo);
154
- #endif
155
- }
118
+ args.push_back("-triple");
119
+ args.push_back("nvptx64-nvidia-cuda");
156
120
 
157
- // Map code to a MemoryBuffer
158
- std::unique_ptr<llvm::MemoryBuffer> buffer = llvm::MemoryBuffer::getMemBufferCopy(cpp_src);
159
- compiler_invocation.getPreprocessorOpts().addRemappedFile(input_file.c_str(), buffer.get());
160
-
161
- if(!debug)
162
- {
163
- compiler_instance.getPreprocessorOpts().addMacroDef("NDEBUG");
121
+ args.push_back("-target-cpu");
122
+ args.push_back("sm_70");
164
123
  }
165
-
166
- if(verify_fp)
124
+ else
167
125
  {
168
- compiler_instance.getPreprocessorOpts().addMacroDef("WP_VERIFY_FP");
169
- }
170
-
171
- compiler_instance.getLangOpts().MicrosoftExt = 1; // __forceinline / __int64
172
- compiler_instance.getLangOpts().DeclSpecKeyword = 1; // __declspec
173
-
174
- compiler_instance.createDiagnostics(text_diagnostic_printer.get(), false);
126
+ args.push_back("-triple");
127
+ args.push_back(target_triple);
175
128
 
176
- clang::EmitLLVMOnlyAction emit_llvm_only_action(&context);
177
- bool success = compiler_instance.ExecuteAction(emit_llvm_only_action);
178
- (void)buffer.release();
179
-
180
- return success ? std::move(emit_llvm_only_action.takeModule()) : nullptr;
181
- }
182
-
183
- static std::unique_ptr<llvm::Module> cuda_to_llvm(const std::string& input_file, const char* cpp_src, const char* include_dir, bool debug, llvm::LLVMContext& context)
184
- {
185
- // Compilation arguments
186
- std::vector<const char*> args;
187
- args.push_back(input_file.c_str());
188
-
189
- args.push_back("-I");
190
- args.push_back(include_dir);
191
-
192
- args.push_back(debug ? "-O0" : "-O2");
193
-
194
- args.push_back("-triple");
195
- args.push_back("nvptx64-nvidia-cuda");
129
+ #if defined(__x86_64__) || defined(_M_X64)
130
+ args.push_back("-target-feature");
131
+ args.push_back("+f16c"); // Enables support for _Float16
132
+ #endif
196
133
 
197
- args.push_back("-target-cpu");
198
- args.push_back("sm_70");
134
+ #if defined(__aarch64__)
135
+ if(tiles_in_stack_memory)
136
+ {
137
+ // Static memory support is broken on AArch64 CPUs. As a workaround we reserve some stack memory on kernel entry,
138
+ // and point the callee-saved x28 register to it so we can access it anywhere. See tile_shared_storage_t in tile.h.
139
+ args.push_back("-target-feature");
140
+ args.push_back("+reserve-x28");
141
+ }
142
+ #endif
143
+ }
199
144
 
145
+ #if LLVM_VERSION_MAJOR >= 21
146
+ clang::DiagnosticOptions diagnostic_options;
147
+ std::unique_ptr<clang::TextDiagnosticPrinter> text_diagnostic_printer =
148
+ std::make_unique<clang::TextDiagnosticPrinter>(llvm::errs(), diagnostic_options);
149
+ clang::IntrusiveRefCntPtr<clang::DiagnosticIDs> diagnostic_ids;
150
+ std::unique_ptr<clang::DiagnosticsEngine> diagnostic_engine =
151
+ std::make_unique<clang::DiagnosticsEngine>(diagnostic_ids, diagnostic_options, text_diagnostic_printer.release());
152
+ #else
200
153
  clang::IntrusiveRefCntPtr<clang::DiagnosticOptions> diagnostic_options = new clang::DiagnosticOptions();
201
154
  std::unique_ptr<clang::TextDiagnosticPrinter> text_diagnostic_printer =
202
155
  std::make_unique<clang::TextDiagnosticPrinter>(llvm::errs(), &*diagnostic_options);
203
156
  clang::IntrusiveRefCntPtr<clang::DiagnosticIDs> diagnostic_ids;
204
157
  std::unique_ptr<clang::DiagnosticsEngine> diagnostic_engine =
205
158
  std::make_unique<clang::DiagnosticsEngine>(diagnostic_ids, &*diagnostic_options, text_diagnostic_printer.release());
159
+ #endif
206
160
 
207
161
  clang::CompilerInstance compiler_instance;
208
162
 
@@ -222,21 +176,43 @@ static std::unique_ptr<llvm::Module> cuda_to_llvm(const std::string& input_file,
222
176
  std::unique_ptr<llvm::MemoryBuffer> buffer = llvm::MemoryBuffer::getMemBufferCopy(cpp_src);
223
177
  compiler_invocation.getPreprocessorOpts().addRemappedFile(input_file.c_str(), buffer.get());
224
178
 
225
- // According to https://llvm.org/docs/CompileCudaWithLLVM.html, "Both clang and nvcc define `__CUDACC__` during CUDA compilation."
226
- // But this normally happens in the __clang_cuda_runtime_wrapper.h header, which we don't include.
227
- // The __CUDA__ and __CUDA_ARCH__ macros are internally defined by llvm-project/clang/lib/Frontend/InitPreprocessor.cpp
228
- compiler_instance.getPreprocessorOpts().addMacroDef("__CUDACC__");
229
-
230
179
  if(!debug)
231
180
  {
232
181
  compiler_instance.getPreprocessorOpts().addMacroDef("NDEBUG");
233
182
  }
183
+
184
+ if(is_cuda)
185
+ {
186
+ // According to https://llvm.org/docs/CompileCudaWithLLVM.html, "Both clang and nvcc define `__CUDACC__` during CUDA compilation."
187
+ // But this normally happens in the __clang_cuda_runtime_wrapper.h header, which we don't include.
188
+ // The __CUDA__ and __CUDA_ARCH__ macros are internally defined by llvm-project/clang/lib/Frontend/InitPreprocessor.cpp
189
+ compiler_instance.getPreprocessorOpts().addMacroDef("__CUDACC__");
190
+
191
+ compiler_instance.getLangOpts().CUDA = 1;
192
+ compiler_instance.getLangOpts().CUDAIsDevice = 1;
193
+ compiler_instance.getLangOpts().CUDAAllowVariadicFunctions = 1;
194
+ }
195
+ else
196
+ {
197
+ if(verify_fp)
198
+ {
199
+ compiler_instance.getPreprocessorOpts().addMacroDef("WP_VERIFY_FP");
200
+ }
234
201
 
235
- compiler_instance.getLangOpts().CUDA = 1;
236
- compiler_instance.getLangOpts().CUDAIsDevice = 1;
237
- compiler_instance.getLangOpts().CUDAAllowVariadicFunctions = 1;
202
+ if(tiles_in_stack_memory)
203
+ {
204
+ compiler_instance.getPreprocessorOpts().addMacroDef("WP_ENABLE_TILES_IN_STACK_MEMORY");
205
+ }
206
+
207
+ compiler_instance.getLangOpts().MicrosoftExt = 1; // __forceinline / __int64
208
+ compiler_instance.getLangOpts().DeclSpecKeyword = 1; // __declspec
209
+ }
238
210
 
211
+ #if LLVM_VERSION_MAJOR >= 21
212
+ compiler_instance.createDiagnostics(*llvm::vfs::getRealFileSystem(), text_diagnostic_printer.get(), false);
213
+ #else
239
214
  compiler_instance.createDiagnostics(text_diagnostic_printer.get(), false);
215
+ #endif
240
216
 
241
217
  clang::EmitLLVMOnlyAction emit_llvm_only_action(&context);
242
218
  bool success = compiler_instance.ExecuteAction(emit_llvm_only_action);
@@ -247,12 +223,12 @@ static std::unique_ptr<llvm::Module> cuda_to_llvm(const std::string& input_file,
247
223
 
248
224
  extern "C" {
249
225
 
250
- WP_API int wp_compile_cpp(const char* cpp_src, const char *input_file, const char* include_dir, const char* output_file, bool debug, bool verify_fp, bool fuse_fp)
226
+ WP_API int wp_compile_cpp(const char* cpp_src, const char *input_file, const char* include_dir, const char* output_file, bool debug, bool verify_fp, bool fuse_fp, bool tiles_in_stack_memory)
251
227
  {
252
228
  initialize_llvm();
253
229
 
254
230
  llvm::LLVMContext context;
255
- std::unique_ptr<llvm::Module> module = cpp_to_llvm(input_file, cpp_src, include_dir, debug, verify_fp, context);
231
+ std::unique_ptr<llvm::Module> module = source_to_llvm(false, input_file, cpp_src, include_dir, debug, verify_fp, context, tiles_in_stack_memory);
256
232
 
257
233
  if(!module)
258
234
  {
@@ -260,7 +236,11 @@ WP_API int wp_compile_cpp(const char* cpp_src, const char *input_file, const cha
260
236
  }
261
237
 
262
238
  std::string error;
239
+ #if LLVM_VERSION_MAJOR >= 22
240
+ const llvm::Target* target = llvm::TargetRegistry::lookupTarget(llvm::Triple(target_triple), error);
241
+ #else
263
242
  const llvm::Target* target = llvm::TargetRegistry::lookupTarget(target_triple, error);
243
+ #endif
264
244
 
265
245
  const char* CPU = "generic";
266
246
  const char* features = "";
@@ -271,7 +251,11 @@ WP_API int wp_compile_cpp(const char* cpp_src, const char *input_file, const cha
271
251
  target_options.AllowFPOpFusion = llvm::FPOpFusion::Strict;
272
252
  llvm::Reloc::Model relocation_model = llvm::Reloc::PIC_; // Position Independent Code
273
253
  llvm::CodeModel::Model code_model = llvm::CodeModel::Large; // Don't make assumptions about displacement sizes
254
+ #if LLVM_VERSION_MAJOR >= 20
255
+ llvm::TargetMachine* target_machine = target->createTargetMachine(llvm::Triple(target_triple), CPU, features, target_options, relocation_model, code_model);
256
+ #else
274
257
  llvm::TargetMachine* target_machine = target->createTargetMachine(target_triple, CPU, features, target_options, relocation_model, code_model);
258
+ #endif
275
259
 
276
260
  module->setDataLayout(target_machine->createDataLayout());
277
261
 
@@ -299,7 +283,7 @@ WP_API int wp_compile_cuda(const char* cpp_src, const char *input_file, const ch
299
283
  initialize_llvm();
300
284
 
301
285
  llvm::LLVMContext context;
302
- std::unique_ptr<llvm::Module> module = cuda_to_llvm(input_file, cpp_src, include_dir, debug, context);
286
+ std::unique_ptr<llvm::Module> module = source_to_llvm(true, input_file, cpp_src, include_dir, debug, false, context, false);
303
287
 
304
288
  if(!module)
305
289
  {
@@ -307,13 +291,22 @@ WP_API int wp_compile_cuda(const char* cpp_src, const char *input_file, const ch
307
291
  }
308
292
 
309
293
  std::string error;
294
+
295
+ #if LLVM_VERSION_MAJOR >= 22
296
+ const llvm::Target* target = llvm::TargetRegistry::lookupTarget(llvm::Triple("nvptx64-nvidia-cuda"), error);
297
+ #else
310
298
  const llvm::Target* target = llvm::TargetRegistry::lookupTarget("nvptx64-nvidia-cuda", error);
299
+ #endif
311
300
 
312
301
  const char* CPU = "sm_70";
313
302
  const char* features = "+ptx75"; // Warp requires CUDA 11.5, which supports PTX ISA 7.5
314
303
  llvm::TargetOptions target_options;
315
304
  llvm::Reloc::Model relocation_model = llvm::Reloc::PIC_;
305
+ #if LLVM_VERSION_MAJOR >= 20
306
+ llvm::TargetMachine* target_machine = target->createTargetMachine(llvm::Triple("nvptx64-nvidia-cuda"), CPU, features, target_options, relocation_model);
307
+ #else
316
308
  llvm::TargetMachine* target_machine = target->createTargetMachine("nvptx64-nvidia-cuda", CPU, features, target_options, relocation_model);
309
+ #endif
317
310
 
318
311
  module->setDataLayout(target_machine->createDataLayout());
319
312
 
@@ -363,8 +356,16 @@ WP_API int wp_load_obj(const char* object_file, const char* module_name)
363
356
 
364
357
  auto jit_expected = llvm::orc::LLJITBuilder()
365
358
  .setObjectLinkingLayerCreator(
359
+ #if LLVM_VERSION_MAJOR >= 21
360
+ [&](llvm::orc::ExecutionSession &session) {
361
+ #else
366
362
  [&](llvm::orc::ExecutionSession &session, const llvm::Triple &triple) {
363
+ #endif
364
+ #if LLVM_VERSION_MAJOR >= 21
365
+ auto get_memory_manager = [](const llvm::MemoryBuffer &) {
366
+ #else
367
367
  auto get_memory_manager = []() {
368
+ #endif
368
369
  return std::make_unique<llvm::SectionMemoryManager>();
369
370
  };
370
371
  auto obj_linking_layer = std::make_unique<llvm::orc::RTDyldObjectLinkingLayer>(session, std::move(get_memory_manager));
@@ -443,6 +444,10 @@ WP_API int wp_load_obj(const char* object_file, const char* module_name)
443
444
  SYMBOL(coshf), SYMBOL_T(cosh, double(*)(double)),
444
445
  SYMBOL(tanhf), SYMBOL_T(tanh, double(*)(double)),
445
446
  SYMBOL(fmaf), SYMBOL_T(fma, double(*)(double, double, double)),
447
+ SYMBOL(erff), SYMBOL_T(erf, double(*)(double)),
448
+ SYMBOL(erfcf), SYMBOL_T(erfc, double(*)(double)),
449
+ SYMBOL(erfinvf), SYMBOL_T(erfinv, double(*)(double)),
450
+ SYMBOL(erfcinvf), SYMBOL_T(erfcinv, double(*)(double)),
446
451
  SYMBOL(memcpy), SYMBOL(memset), SYMBOL(memmove),
447
452
  SYMBOL(_wp_assert),
448
453
  SYMBOL(_wp_isfinite),
@@ -454,13 +459,8 @@ WP_API int wp_load_obj(const char* object_file, const char* module_name)
454
459
  // triggering the stack overflow guards.
455
460
  SYMBOL(__chkstk),
456
461
  #elif defined(__APPLE__)
457
- #if defined(__MACH__) && defined(__aarch64__)
458
- SYMBOL(bzero),
459
- SYMBOL(_bzero),
460
- #else
461
- // Intel Mac
462
- SYMBOL(__bzero),
463
- #endif
462
+ SYMBOL(bzero),
463
+ SYMBOL(_bzero),
464
464
  SYMBOL(memset_pattern16),
465
465
  SYMBOL(__sincos_stret), SYMBOL(__sincosf_stret),
466
466
  #else
@@ -531,6 +531,11 @@ WP_API uint64_t wp_lookup(const char* dll_name, const char* function_name)
531
531
  return func->getValue();
532
532
  }
533
533
 
534
+ WP_API const char* wp_warp_clang_version()
535
+ {
536
+ return WP_VERSION_STRING;
537
+ }
538
+
534
539
  } // extern "C"
535
540
 
536
541
  } // namespace wp
warp/native/coloring.cpp CHANGED
@@ -346,7 +346,6 @@ public:
346
346
  return node_weights[node_idx];
347
347
  }
348
348
 
349
-
350
349
  void add_node(int weight, int node_idx)
351
350
  {
352
351
  if (weight >= weight_buckets.size())
warp/native/crt.h CHANGED
@@ -311,6 +311,14 @@ float tanhf(float);
311
311
  double tanh(double);
312
312
  float fmaf(float, float, float);
313
313
  double fma(double, double, double);
314
+ double erf(double);
315
+ float erff(float);
316
+ double erfc(double);
317
+ float erfcf(float);
318
+ double erfinv(double);
319
+ float erfinvf(float);
320
+ double erfcinv(double);
321
+ float erfcinvf(float);
314
322
 
315
323
  // stddef.h
316
324
  #if defined(_WIN32)
@@ -358,3 +366,203 @@ inline bool isinf(double x)
358
366
  #endif // !__CUDACC__
359
367
 
360
368
  #endif // WP_NO_CRT
369
+
370
+ #if !defined(__CUDACC__)
371
+
372
+ /*
373
+ * From Cephes Library polevl.c
374
+ * Original source: https://www.netlib.org/cephes/
375
+ * Copyright (c) 1984 by Stephen L. Moshier.
376
+ * All rights reserved.
377
+ */
378
// Evaluate a degree-N polynomial at x using Horner's method.
//   x     - evaluation point
//   coefs - N + 1 coefficients, leading (highest-order) coefficient first
//   N     - polynomial degree; N <= 0 returns coefs[0] unchanged
static inline double polevl(double x, const double* coefs, int N)
{
    const double* next = coefs;
    double acc = *next++;

    // one multiply-add per remaining coefficient, in storage order
    for (int remaining = N; remaining > 0; --remaining)
    {
        acc = acc * x + *next++;
    }

    return acc;
}
388
+
389
+ /*
390
+ * From Cephes Library polevl.c
391
+ * Original source: https://www.netlib.org/cephes/
392
+ * Copyright (c) 1984 by Stephen L. Moshier.
393
+ * All rights reserved.
394
+ */
395
// Evaluate a degree-N polynomial whose leading coefficient is an implicit 1,
// using Horner's method.
//   x     - evaluation point
//   coefs - the N coefficients that follow the implicit leading 1
//   N     - polynomial degree (number of stored coefficients)
static inline double p1evl(double x, const double* coefs, int N)
{
    // the implicit leading 1 turns the first Horner step into a plain addition
    double acc = x + coefs[0];

    int k = 1;
    while (k < N)
    {
        acc = acc * x + coefs[k];
        ++k;
    }

    return acc;
}
405
+
406
+ /*
407
+ * From Cephes Library ndtri.c
408
+ * Original source: https://www.netlib.org/cephes/
409
+ * Copyright (c) 1984 by Stephen L. Moshier.
410
+ * All rights reserved.
411
+ */
412
+ // inverse normal distribution function (ndtri)
413
+ static inline double ndtri(double y)
414
+ {
415
+ // domain check
416
+ if (y <= 0.0 || y >= 1.0)
417
+ {
418
+ return (y <= 0.0) ? -HUGE_VAL : HUGE_VAL;
419
+ }
420
+
421
+ // constants from Cephes
422
+ const double s2pi = 2.50662827463100050242E0; // sqrt(2*pi)
423
+ const double exp_neg2 = 0.13533528323661269189; // exp(-2)
424
+
425
+ // approximation for 0 <= abs(z - 0.5) <= 3/8
426
+ static const double P0[5] = {
427
+ -5.99633501014107895267e1,
428
+ 9.80010754185999661536e1,
429
+ -5.66762857469070293439e1,
430
+ 1.39312609387279679503e1,
431
+ -1.23916583867381258016e0
432
+ };
433
+
434
+ static const double Q0[8] = {
435
+ 1.95448858338141759834e0,
436
+ 4.67627912898881538453e0,
437
+ 8.63602421390890590575e1,
438
+ -2.25462687854119370527e2,
439
+ 2.00260212380060660359e2,
440
+ -8.20372256168333339912e1,
441
+ 1.59056225126211695515e1,
442
+ -1.18331621121330003142e0
443
+ };
444
+
445
+ // approximation for interval z = sqrt(-2 log y) between 2 and 8
446
+ static const double P1[9] = {
447
+ 4.05544892305962419923e0,
448
+ 3.15251094599893866154e1,
449
+ 5.71628192246421288162e1,
450
+ 4.40805073893200834700e1,
451
+ 1.46849561928858024014e1,
452
+ 2.18663306850790267539e0,
453
+ -1.40256079171354495875e-1,
454
+ -3.50424626827848203418e-2,
455
+ -8.57456785154685413611e-4
456
+ };
457
+
458
+ static const double Q1[8] = {
459
+ 1.57799883256466749731e1,
460
+ 4.53907635128879210584e1,
461
+ 4.13172038254672030440e1,
462
+ 1.50425385692907503408e1,
463
+ 2.50464946208309415979e0,
464
+ -1.42182922854787788574e-1,
465
+ -3.80806407691578277194e-2,
466
+ -9.33259480895457427372e-4
467
+ };
468
+
469
+ // approximation for interval z = sqrt(-2 log y) between 8 and 64
470
+ static const double P2[9] = {
471
+ 3.23774891776946035970e0,
472
+ 6.91522889068984211695e0,
473
+ 3.93881025292474443415e0,
474
+ 1.33303460815807542389e0,
475
+ 2.01485389549179081538e-1,
476
+ 1.23716634817820021358e-2,
477
+ 3.01581553508235416007e-4,
478
+ 2.65806974686737550832e-6,
479
+ 6.23974539184983293730e-9
480
+ };
481
+
482
+ static const double Q2[8] = {
483
+ 6.02427039364742014255e0,
484
+ 3.67983563856160859403e0,
485
+ 1.37702099489081330271e0,
486
+ 2.16236993594496635890e-1,
487
+ 1.34204006088543189037e-2,
488
+ 3.28014464682127739104e-4,
489
+ 2.89247864745380683936e-6,
490
+ 6.79019408009981274425e-9
491
+ };
492
+
493
+ int code = 1;
494
+ double y_work = y;
495
+
496
+ if (y_work > (1.0 - exp_neg2))
497
+ {
498
+ y_work = 1.0 - y_work;
499
+ code = 0;
500
+ }
501
+
502
+ // middle region: 0 <= |y - 0.5| <= 3/8
503
+ if (y_work > exp_neg2)
504
+ {
505
+ y_work -= 0.5;
506
+ double y2 = y_work * y_work;
507
+ double x = y_work + y_work * (y2 * polevl(y2, P0, 4) / p1evl(y2, Q0, 8));
508
+ x = x * s2pi;
509
+ return x;
510
+ }
511
+
512
+ double x = ::sqrt(-2.0 * ::log(y_work));
513
+ double x0 = x - ::log(x) / x;
514
+
515
+ double z = 1.0 / x;
516
+ double x1;
517
+ if (x < 8.0)
518
+ {
519
+ x1 = z * polevl(z, P1, 8) / p1evl(z, Q1, 8);
520
+ }
521
+ else
522
+ {
523
+ x1 = z * polevl(z, P2, 8) / p1evl(z, Q2, 8);
524
+ }
525
+
526
+ x = x0 - x1;
527
+ if (code != 0)
528
+ {
529
+ x = -x;
530
+ }
531
+
532
+ return x;
533
+ }
534
+
535
+ // inverse error function (not in standard C library)
536
+ // only compiled for non-CUDA builds - CUDA provides these in its math headers
537
+ inline double erfinv(double z)
538
+ {
539
+ // handle special cases
540
+ if (z == 0.0)
541
+ return 0.0;
542
+ if (z == 1.0)
543
+ return HUGE_VAL; // infinity
544
+ if (z == -1.0)
545
+ return -HUGE_VAL; // -infinity
546
+ if (z < -1.0 || z > 1.0)
547
+ return NAN; // outside valid range
548
+
549
+ // erfinv(z) = ndtri((z + 1) / 2) / sqrt(2)
550
+ return ndtri((z + 1.0) / 2.0) / ::sqrt(2.0);
551
+ }
552
+
553
+ inline float erfinvf(float x)
554
+ {
555
+ return (float)erfinv((double)x);
556
+ }
557
+
558
+ inline double erfcinv(double x)
559
+ {
560
+ return erfinv(1.0 - x);
561
+ }
562
+
563
+ inline float erfcinvf(float x)
564
+ {
565
+ return (float)erfcinv((double)x);
566
+ }
567
+
568
+ #endif // !defined(__CUDACC__)