warp-lang 0.9.0__py3-none-win_amd64.whl → 0.11.0__py3-none-win_amd64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of warp-lang might be problematic. Click here for more details.

Files changed (315) hide show
  1. warp/__init__.py +15 -7
  2. warp/__init__.pyi +1 -0
  3. warp/bin/warp-clang.dll +0 -0
  4. warp/bin/warp.dll +0 -0
  5. warp/build.py +22 -443
  6. warp/build_dll.py +384 -0
  7. warp/builtins.py +998 -488
  8. warp/codegen.py +1307 -739
  9. warp/config.py +5 -3
  10. warp/constants.py +6 -0
  11. warp/context.py +1291 -548
  12. warp/dlpack.py +31 -31
  13. warp/fabric.py +326 -0
  14. warp/fem/__init__.py +27 -0
  15. warp/fem/cache.py +389 -0
  16. warp/fem/dirichlet.py +181 -0
  17. warp/fem/domain.py +263 -0
  18. warp/fem/field/__init__.py +101 -0
  19. warp/fem/field/field.py +149 -0
  20. warp/fem/field/nodal_field.py +299 -0
  21. warp/fem/field/restriction.py +21 -0
  22. warp/fem/field/test.py +181 -0
  23. warp/fem/field/trial.py +183 -0
  24. warp/fem/geometry/__init__.py +19 -0
  25. warp/fem/geometry/closest_point.py +70 -0
  26. warp/fem/geometry/deformed_geometry.py +271 -0
  27. warp/fem/geometry/element.py +744 -0
  28. warp/fem/geometry/geometry.py +186 -0
  29. warp/fem/geometry/grid_2d.py +373 -0
  30. warp/fem/geometry/grid_3d.py +435 -0
  31. warp/fem/geometry/hexmesh.py +953 -0
  32. warp/fem/geometry/partition.py +376 -0
  33. warp/fem/geometry/quadmesh_2d.py +532 -0
  34. warp/fem/geometry/tetmesh.py +840 -0
  35. warp/fem/geometry/trimesh_2d.py +577 -0
  36. warp/fem/integrate.py +1616 -0
  37. warp/fem/operator.py +191 -0
  38. warp/fem/polynomial.py +213 -0
  39. warp/fem/quadrature/__init__.py +2 -0
  40. warp/fem/quadrature/pic_quadrature.py +245 -0
  41. warp/fem/quadrature/quadrature.py +294 -0
  42. warp/fem/space/__init__.py +292 -0
  43. warp/fem/space/basis_space.py +489 -0
  44. warp/fem/space/collocated_function_space.py +105 -0
  45. warp/fem/space/dof_mapper.py +236 -0
  46. warp/fem/space/function_space.py +145 -0
  47. warp/fem/space/grid_2d_function_space.py +267 -0
  48. warp/fem/space/grid_3d_function_space.py +306 -0
  49. warp/fem/space/hexmesh_function_space.py +352 -0
  50. warp/fem/space/partition.py +350 -0
  51. warp/fem/space/quadmesh_2d_function_space.py +369 -0
  52. warp/fem/space/restriction.py +160 -0
  53. warp/fem/space/shape/__init__.py +15 -0
  54. warp/fem/space/shape/cube_shape_function.py +738 -0
  55. warp/fem/space/shape/shape_function.py +103 -0
  56. warp/fem/space/shape/square_shape_function.py +611 -0
  57. warp/fem/space/shape/tet_shape_function.py +567 -0
  58. warp/fem/space/shape/triangle_shape_function.py +429 -0
  59. warp/fem/space/tetmesh_function_space.py +292 -0
  60. warp/fem/space/topology.py +295 -0
  61. warp/fem/space/trimesh_2d_function_space.py +221 -0
  62. warp/fem/types.py +77 -0
  63. warp/fem/utils.py +495 -0
  64. warp/native/array.h +164 -55
  65. warp/native/builtin.h +150 -174
  66. warp/native/bvh.cpp +75 -328
  67. warp/native/bvh.cu +406 -23
  68. warp/native/bvh.h +37 -45
  69. warp/native/clang/clang.cpp +136 -24
  70. warp/native/crt.cpp +1 -76
  71. warp/native/crt.h +111 -104
  72. warp/native/cuda_crt.h +1049 -0
  73. warp/native/cuda_util.cpp +15 -3
  74. warp/native/cuda_util.h +3 -1
  75. warp/native/cutlass/tools/library/scripts/conv2d_operation.py +463 -0
  76. warp/native/cutlass/tools/library/scripts/conv3d_operation.py +321 -0
  77. warp/native/cutlass/tools/library/scripts/gemm_operation.py +988 -0
  78. warp/native/cutlass/tools/library/scripts/generator.py +4625 -0
  79. warp/native/cutlass/tools/library/scripts/library.py +799 -0
  80. warp/native/cutlass/tools/library/scripts/manifest.py +402 -0
  81. warp/native/cutlass/tools/library/scripts/pycutlass/docs/source/conf.py +96 -0
  82. warp/native/cutlass/tools/library/scripts/pycutlass/profile/conv/conv2d_f16_sm80.py +106 -0
  83. warp/native/cutlass/tools/library/scripts/pycutlass/profile/gemm/gemm_f32_sm80.py +91 -0
  84. warp/native/cutlass/tools/library/scripts/pycutlass/setup.py +80 -0
  85. warp/native/cutlass/tools/library/scripts/pycutlass/src/pycutlass/__init__.py +48 -0
  86. warp/native/cutlass/tools/library/scripts/pycutlass/src/pycutlass/arguments.py +118 -0
  87. warp/native/cutlass/tools/library/scripts/pycutlass/src/pycutlass/c_types.py +241 -0
  88. warp/native/cutlass/tools/library/scripts/pycutlass/src/pycutlass/compiler.py +432 -0
  89. warp/native/cutlass/tools/library/scripts/pycutlass/src/pycutlass/conv2d_operation.py +631 -0
  90. warp/native/cutlass/tools/library/scripts/pycutlass/src/pycutlass/epilogue.py +1026 -0
  91. warp/native/cutlass/tools/library/scripts/pycutlass/src/pycutlass/frontend.py +104 -0
  92. warp/native/cutlass/tools/library/scripts/pycutlass/src/pycutlass/gemm_operation.py +1276 -0
  93. warp/native/cutlass/tools/library/scripts/pycutlass/src/pycutlass/library.py +744 -0
  94. warp/native/cutlass/tools/library/scripts/pycutlass/src/pycutlass/memory_manager.py +74 -0
  95. warp/native/cutlass/tools/library/scripts/pycutlass/src/pycutlass/operation.py +110 -0
  96. warp/native/cutlass/tools/library/scripts/pycutlass/src/pycutlass/parser.py +619 -0
  97. warp/native/cutlass/tools/library/scripts/pycutlass/src/pycutlass/reduction_operation.py +398 -0
  98. warp/native/cutlass/tools/library/scripts/pycutlass/src/pycutlass/tensor_ref.py +70 -0
  99. warp/native/cutlass/tools/library/scripts/pycutlass/src/pycutlass/test/__init__.py +4 -0
  100. warp/native/cutlass/tools/library/scripts/pycutlass/src/pycutlass/test/conv2d_testbed.py +646 -0
  101. warp/native/cutlass/tools/library/scripts/pycutlass/src/pycutlass/test/gemm_grouped_testbed.py +235 -0
  102. warp/native/cutlass/tools/library/scripts/pycutlass/src/pycutlass/test/gemm_testbed.py +557 -0
  103. warp/native/cutlass/tools/library/scripts/pycutlass/src/pycutlass/test/profiler.py +70 -0
  104. warp/native/cutlass/tools/library/scripts/pycutlass/src/pycutlass/type_hint.py +39 -0
  105. warp/native/cutlass/tools/library/scripts/pycutlass/src/pycutlass/utils/__init__.py +1 -0
  106. warp/native/cutlass/tools/library/scripts/pycutlass/src/pycutlass/utils/device.py +76 -0
  107. warp/native/cutlass/tools/library/scripts/pycutlass/src/pycutlass/utils/reference_model.py +255 -0
  108. warp/native/cutlass/tools/library/scripts/pycutlass/test/conv/__init__.py +0 -0
  109. warp/native/cutlass/tools/library/scripts/pycutlass/test/conv/conv2d_dgrad_implicit_gemm_f16nhwc_f16nhwc_f16nhwc_tensor_op_f16_sm80.py +201 -0
  110. warp/native/cutlass/tools/library/scripts/pycutlass/test/conv/conv2d_dgrad_implicit_gemm_f16nhwc_f16nhwc_f32nhwc_tensor_op_f32_sm80.py +177 -0
  111. warp/native/cutlass/tools/library/scripts/pycutlass/test/conv/conv2d_dgrad_implicit_gemm_f32nhwc_f32nhwc_f32nhwc_simt_f32_sm80.py +98 -0
  112. warp/native/cutlass/tools/library/scripts/pycutlass/test/conv/conv2d_dgrad_implicit_gemm_tf32nhwc_tf32nhwc_f32nhwc_tensor_op_f32_sm80.py +95 -0
  113. warp/native/cutlass/tools/library/scripts/pycutlass/test/conv/conv2d_fprop_few_channels_f16nhwc_f16nhwc_f16nhwc_tensor_op_f32_sm80.py +163 -0
  114. warp/native/cutlass/tools/library/scripts/pycutlass/test/conv/conv2d_fprop_fixed_channels_f16nhwc_f16nhwc_f16nhwc_tensor_op_f32_sm80.py +187 -0
  115. warp/native/cutlass/tools/library/scripts/pycutlass/test/conv/conv2d_fprop_implicit_gemm_f16nhwc_f16nhwc_f16nhwc_tensor_op_f16_sm80.py +309 -0
  116. warp/native/cutlass/tools/library/scripts/pycutlass/test/conv/conv2d_fprop_implicit_gemm_f16nhwc_f16nhwc_f32nhwc_tensor_op_f32_sm80.py +54 -0
  117. warp/native/cutlass/tools/library/scripts/pycutlass/test/conv/conv2d_fprop_implicit_gemm_f32nhwc_f32nhwc_f32nhwc_simt_f32_sm80.py +96 -0
  118. warp/native/cutlass/tools/library/scripts/pycutlass/test/conv/conv2d_fprop_implicit_gemm_tf32nhwc_tf32nhwc_f32nhwc_tensor_op_f32_sm80.py +107 -0
  119. warp/native/cutlass/tools/library/scripts/pycutlass/test/conv/conv2d_strided_dgrad_implicit_gemm_f16nhwc_f16nhwc_f32nhwc_tensor_op_f32_sm80.py +253 -0
  120. warp/native/cutlass/tools/library/scripts/pycutlass/test/conv/conv2d_wgrad_implicit_gemm_f16nhwc_f16nhwc_f16nhwc_tensor_op_f16_sm80.py +97 -0
  121. warp/native/cutlass/tools/library/scripts/pycutlass/test/conv/conv2d_wgrad_implicit_gemm_f16nhwc_f16nhwc_f32nhwc_tensor_op_f32_sm80.py +242 -0
  122. warp/native/cutlass/tools/library/scripts/pycutlass/test/conv/conv2d_wgrad_implicit_gemm_f32nhwc_f32nhwc_f32nhwc_simt_f32_sm80.py +96 -0
  123. warp/native/cutlass/tools/library/scripts/pycutlass/test/conv/conv2d_wgrad_implicit_gemm_tf32nhwc_tf32nhwc_f32nhwc_tensor_op_f32_sm80.py +107 -0
  124. warp/native/cutlass/tools/library/scripts/pycutlass/test/conv/run_all_tests.py +10 -0
  125. warp/native/cutlass/tools/library/scripts/pycutlass/test/frontend/test_frontend.py +146 -0
  126. warp/native/cutlass/tools/library/scripts/pycutlass/test/gemm/__init__.py +0 -0
  127. warp/native/cutlass/tools/library/scripts/pycutlass/test/gemm/gemm_bf16_sm80.py +96 -0
  128. warp/native/cutlass/tools/library/scripts/pycutlass/test/gemm/gemm_f16_sm80.py +447 -0
  129. warp/native/cutlass/tools/library/scripts/pycutlass/test/gemm/gemm_f32_sm80.py +146 -0
  130. warp/native/cutlass/tools/library/scripts/pycutlass/test/gemm/gemm_f64_sm80.py +102 -0
  131. warp/native/cutlass/tools/library/scripts/pycutlass/test/gemm/gemm_grouped_sm80.py +203 -0
  132. warp/native/cutlass/tools/library/scripts/pycutlass/test/gemm/gemm_s8_sm80.py +229 -0
  133. warp/native/cutlass/tools/library/scripts/pycutlass/test/gemm/run_all_tests.py +9 -0
  134. warp/native/cutlass/tools/library/scripts/pycutlass/test/unit/test_sm80.py +453 -0
  135. warp/native/cutlass/tools/library/scripts/rank_2k_operation.py +398 -0
  136. warp/native/cutlass/tools/library/scripts/rank_k_operation.py +387 -0
  137. warp/native/cutlass/tools/library/scripts/rt.py +796 -0
  138. warp/native/cutlass/tools/library/scripts/symm_operation.py +400 -0
  139. warp/native/cutlass/tools/library/scripts/trmm_operation.py +407 -0
  140. warp/native/cutlass_gemm.cu +5 -3
  141. warp/native/exports.h +1240 -949
  142. warp/native/fabric.h +228 -0
  143. warp/native/hashgrid.cpp +4 -4
  144. warp/native/hashgrid.h +22 -2
  145. warp/native/initializer_array.h +2 -2
  146. warp/native/intersect.h +22 -7
  147. warp/native/intersect_adj.h +8 -8
  148. warp/native/intersect_tri.h +13 -16
  149. warp/native/marching.cu +157 -161
  150. warp/native/mat.h +119 -19
  151. warp/native/matnn.h +2 -2
  152. warp/native/mesh.cpp +108 -83
  153. warp/native/mesh.cu +243 -6
  154. warp/native/mesh.h +1547 -458
  155. warp/native/nanovdb/NanoVDB.h +1 -1
  156. warp/native/noise.h +272 -329
  157. warp/native/quat.h +51 -8
  158. warp/native/rand.h +45 -35
  159. warp/native/range.h +6 -2
  160. warp/native/reduce.cpp +157 -0
  161. warp/native/reduce.cu +348 -0
  162. warp/native/runlength_encode.cpp +62 -0
  163. warp/native/runlength_encode.cu +46 -0
  164. warp/native/scan.cu +11 -13
  165. warp/native/scan.h +1 -0
  166. warp/native/solid_angle.h +442 -0
  167. warp/native/sort.cpp +13 -0
  168. warp/native/sort.cu +9 -1
  169. warp/native/sparse.cpp +338 -0
  170. warp/native/sparse.cu +545 -0
  171. warp/native/spatial.h +2 -2
  172. warp/native/temp_buffer.h +30 -0
  173. warp/native/vec.h +126 -24
  174. warp/native/volume.h +120 -0
  175. warp/native/warp.cpp +658 -53
  176. warp/native/warp.cu +660 -68
  177. warp/native/warp.h +112 -12
  178. warp/optim/__init__.py +1 -0
  179. warp/optim/linear.py +922 -0
  180. warp/optim/sgd.py +92 -0
  181. warp/render/render_opengl.py +392 -152
  182. warp/render/render_usd.py +11 -11
  183. warp/sim/__init__.py +2 -2
  184. warp/sim/articulation.py +385 -185
  185. warp/sim/collide.py +21 -8
  186. warp/sim/import_mjcf.py +297 -106
  187. warp/sim/import_urdf.py +389 -210
  188. warp/sim/import_usd.py +198 -97
  189. warp/sim/inertia.py +17 -18
  190. warp/sim/integrator_euler.py +14 -8
  191. warp/sim/integrator_xpbd.py +161 -19
  192. warp/sim/model.py +795 -291
  193. warp/sim/optimizer.py +2 -6
  194. warp/sim/render.py +65 -3
  195. warp/sim/utils.py +3 -0
  196. warp/sparse.py +1227 -0
  197. warp/stubs.py +665 -223
  198. warp/tape.py +66 -15
  199. warp/tests/__main__.py +3 -6
  200. warp/tests/assets/curlnoise_golden.npy +0 -0
  201. warp/tests/assets/pnoise_golden.npy +0 -0
  202. warp/tests/assets/torus.usda +105 -105
  203. warp/tests/{test_class_kernel.py → aux_test_class_kernel.py} +9 -1
  204. warp/tests/aux_test_conditional_unequal_types_kernels.py +21 -0
  205. warp/tests/{test_dependent.py → aux_test_dependent.py} +2 -2
  206. warp/tests/{test_reference.py → aux_test_reference.py} +1 -1
  207. warp/tests/aux_test_unresolved_func.py +14 -0
  208. warp/tests/aux_test_unresolved_symbol.py +14 -0
  209. warp/tests/disabled_kinematics.py +239 -0
  210. warp/tests/run_coverage_serial.py +31 -0
  211. warp/tests/test_adam.py +103 -106
  212. warp/tests/test_arithmetic.py +128 -74
  213. warp/tests/test_array.py +1497 -211
  214. warp/tests/test_array_reduce.py +150 -0
  215. warp/tests/test_atomic.py +64 -28
  216. warp/tests/test_bool.py +99 -0
  217. warp/tests/test_builtins_resolution.py +1292 -0
  218. warp/tests/test_bvh.py +75 -43
  219. warp/tests/test_closest_point_edge_edge.py +54 -57
  220. warp/tests/test_codegen.py +233 -128
  221. warp/tests/test_compile_consts.py +28 -20
  222. warp/tests/test_conditional.py +108 -24
  223. warp/tests/test_copy.py +10 -12
  224. warp/tests/test_ctypes.py +112 -88
  225. warp/tests/test_dense.py +21 -14
  226. warp/tests/test_devices.py +98 -0
  227. warp/tests/test_dlpack.py +136 -108
  228. warp/tests/test_examples.py +277 -0
  229. warp/tests/test_fabricarray.py +955 -0
  230. warp/tests/test_fast_math.py +15 -11
  231. warp/tests/test_fem.py +1271 -0
  232. warp/tests/test_fp16.py +53 -19
  233. warp/tests/test_func.py +187 -74
  234. warp/tests/test_generics.py +194 -49
  235. warp/tests/test_grad.py +180 -116
  236. warp/tests/test_grad_customs.py +176 -0
  237. warp/tests/test_hash_grid.py +52 -37
  238. warp/tests/test_import.py +10 -23
  239. warp/tests/test_indexedarray.py +577 -24
  240. warp/tests/test_intersect.py +18 -9
  241. warp/tests/test_large.py +141 -0
  242. warp/tests/test_launch.py +251 -15
  243. warp/tests/test_lerp.py +64 -65
  244. warp/tests/test_linear_solvers.py +154 -0
  245. warp/tests/test_lvalue.py +493 -0
  246. warp/tests/test_marching_cubes.py +12 -13
  247. warp/tests/test_mat.py +508 -2778
  248. warp/tests/test_mat_lite.py +115 -0
  249. warp/tests/test_mat_scalar_ops.py +2889 -0
  250. warp/tests/test_math.py +103 -9
  251. warp/tests/test_matmul.py +305 -69
  252. warp/tests/test_matmul_lite.py +410 -0
  253. warp/tests/test_mesh.py +71 -14
  254. warp/tests/test_mesh_query_aabb.py +41 -25
  255. warp/tests/test_mesh_query_point.py +325 -34
  256. warp/tests/test_mesh_query_ray.py +39 -22
  257. warp/tests/test_mlp.py +30 -22
  258. warp/tests/test_model.py +92 -89
  259. warp/tests/test_modules_lite.py +39 -0
  260. warp/tests/test_multigpu.py +88 -114
  261. warp/tests/test_noise.py +12 -11
  262. warp/tests/test_operators.py +16 -20
  263. warp/tests/test_options.py +11 -11
  264. warp/tests/test_pinned.py +17 -18
  265. warp/tests/test_print.py +32 -11
  266. warp/tests/test_quat.py +275 -129
  267. warp/tests/test_rand.py +18 -16
  268. warp/tests/test_reload.py +38 -34
  269. warp/tests/test_rounding.py +50 -43
  270. warp/tests/test_runlength_encode.py +190 -0
  271. warp/tests/test_smoothstep.py +9 -11
  272. warp/tests/test_snippet.py +143 -0
  273. warp/tests/test_sparse.py +460 -0
  274. warp/tests/test_spatial.py +276 -243
  275. warp/tests/test_streams.py +110 -85
  276. warp/tests/test_struct.py +331 -85
  277. warp/tests/test_tape.py +39 -21
  278. warp/tests/test_torch.py +118 -89
  279. warp/tests/test_transient_module.py +12 -13
  280. warp/tests/test_types.py +614 -0
  281. warp/tests/test_utils.py +494 -0
  282. warp/tests/test_vec.py +354 -1987
  283. warp/tests/test_vec_lite.py +73 -0
  284. warp/tests/test_vec_scalar_ops.py +2099 -0
  285. warp/tests/test_volume.py +457 -293
  286. warp/tests/test_volume_write.py +124 -134
  287. warp/tests/unittest_serial.py +35 -0
  288. warp/tests/unittest_suites.py +341 -0
  289. warp/tests/unittest_utils.py +568 -0
  290. warp/tests/unused_test_misc.py +71 -0
  291. warp/tests/{test_debug.py → walkthough_debug.py} +3 -17
  292. warp/thirdparty/appdirs.py +36 -45
  293. warp/thirdparty/unittest_parallel.py +549 -0
  294. warp/torch.py +72 -30
  295. warp/types.py +1744 -713
  296. warp/utils.py +360 -350
  297. warp_lang-0.11.0.dist-info/LICENSE.md +36 -0
  298. warp_lang-0.11.0.dist-info/METADATA +238 -0
  299. warp_lang-0.11.0.dist-info/RECORD +332 -0
  300. {warp_lang-0.9.0.dist-info → warp_lang-0.11.0.dist-info}/WHEEL +1 -1
  301. warp/bin/warp-clang.exp +0 -0
  302. warp/bin/warp-clang.lib +0 -0
  303. warp/bin/warp.exp +0 -0
  304. warp/bin/warp.lib +0 -0
  305. warp/tests/test_all.py +0 -215
  306. warp/tests/test_array_scan.py +0 -60
  307. warp/tests/test_base.py +0 -208
  308. warp/tests/test_unresolved_func.py +0 -7
  309. warp/tests/test_unresolved_symbol.py +0 -7
  310. warp_lang-0.9.0.dist-info/METADATA +0 -20
  311. warp_lang-0.9.0.dist-info/RECORD +0 -177
  312. /warp/tests/{test_compile_consts_dummy.py → aux_test_compile_consts_dummy.py} +0 -0
  313. /warp/tests/{test_reference_reference.py → aux_test_reference_reference.py} +0 -0
  314. /warp/tests/{test_square.py → aux_test_square.py} +0 -0
  315. {warp_lang-0.9.0.dist-info → warp_lang-0.11.0.dist-info}/top_level.txt +0 -0
@@ -25,6 +25,8 @@
25
25
  #include <llvm/PassRegistry.h>
26
26
  #include <llvm/InitializePasses.h>
27
27
  #include <llvm/IR/LegacyPassManager.h>
28
+ #include <llvm/IRReader/IRReader.h>
29
+ #include <llvm/Linker/Linker.h>
28
30
 
29
31
  #include <llvm/ExecutionEngine/Orc/LLJIT.h>
30
32
  #include <llvm/ExecutionEngine/JITEventListener.h>
@@ -45,6 +47,7 @@
45
47
  #elif defined(__APPLE__)
46
48
  extern "C" void __bzero(void*, size_t);
47
49
  extern "C" __double2 __sincos_stret(double);
50
+ extern "C" __float2 __sincosf_stret(float);
48
51
  #endif
49
52
 
50
53
  extern "C" {
@@ -54,21 +57,20 @@ extern "C" {
54
57
  // On Linux it suffices for these symbols not to be stripped out, while for Windows a .pdb has to contain
55
58
  // their information. LLVM defines them, but we don't want a huge .pdb with all LLVM source code's debug
56
59
  // info. By forward-declaring them here it suffices to compile this file with /Zi.
57
- struct jit_descriptor;
58
- extern jit_descriptor __jit_debug_descriptor;
60
+ extern struct jit_descriptor __jit_debug_descriptor;
59
61
  extern void __jit_debug_register_code();
60
62
 
61
63
  }
62
64
 
63
65
  namespace wp {
64
-
66
+
65
67
  #if defined (_WIN32)
66
- // Windows defaults to using the COFF binary format (aka. "msvc" in the target triple).
67
- // Override it to use the ELF format to support DWARF debug info, but keep using the
68
- // Microsoft calling convention (see also https://llvm.org/docs/DebuggingJITedCode.html).
69
- static const char* target_triple = "x86_64-pc-windows-elf";
68
+ // Windows defaults to using the COFF binary format (aka. "msvc" in the target triple).
69
+ // Override it to use the ELF format to support DWARF debug info, but keep using the
70
+ // Microsoft calling convention (see also https://llvm.org/docs/DebuggingJITedCode.html).
71
+ static const char* target_triple = "x86_64-pc-windows-elf";
70
72
  #else
71
- static const char* target_triple = LLVM_DEFAULT_TARGET_TRIPLE;
73
+ static const char* target_triple = LLVM_DEFAULT_TARGET_TRIPLE;
72
74
  #endif
73
75
 
74
76
  static void initialize_llvm()
@@ -93,6 +95,11 @@ static std::unique_ptr<llvm::Module> cpp_to_llvm(const std::string& input_file,
93
95
  args.push_back("-triple");
94
96
  args.push_back(target_triple);
95
97
 
98
+ #if defined(__x86_64__) || defined(_M_X64)
99
+ args.push_back("-target-feature");
100
+ args.push_back("+f16c"); // Enables support for _Float16
101
+ #endif
102
+
96
103
  clang::IntrusiveRefCntPtr<clang::DiagnosticOptions> diagnostic_options = new clang::DiagnosticOptions();
97
104
  std::unique_ptr<clang::TextDiagnosticPrinter> text_diagnostic_printer =
98
105
  std::make_unique<clang::TextDiagnosticPrinter>(llvm::errs(), &*diagnostic_options);
@@ -114,8 +121,6 @@ static std::unique_ptr<llvm::Module> cpp_to_llvm(const std::string& input_file,
114
121
  std::unique_ptr<llvm::MemoryBuffer> buffer = llvm::MemoryBuffer::getMemBufferCopy(cpp_src);
115
122
  compiler_invocation.getPreprocessorOpts().addRemappedFile(input_file.c_str(), buffer.get());
116
123
 
117
- compiler_instance.getPreprocessorOpts().addMacroDef("WP_CPU");
118
-
119
124
  if(!debug)
120
125
  {
121
126
  compiler_instance.getPreprocessorOpts().addMacroDef("NDEBUG");
@@ -133,18 +138,71 @@ static std::unique_ptr<llvm::Module> cpp_to_llvm(const std::string& input_file,
133
138
  return success ? std::move(emit_llvm_only_action.takeModule()) : nullptr;
134
139
  }
135
140
 
136
- extern "C" {
137
-
138
- WP_API int compile_cpp(const char* cpp_src, const char* include_dir, const char* output_file, bool debug)
141
+ static std::unique_ptr<llvm::Module> cuda_to_llvm(const std::string& input_file, const char* cpp_src, const char* include_dir, bool debug, llvm::LLVMContext& context)
139
142
  {
140
- #if defined (_WIN32)
141
- const char* obj_ext = ".obj";
142
- #else
143
- const char* obj_ext = ".o";
144
- #endif
143
+ // Compilation arguments
144
+ std::vector<const char*> args;
145
+ args.push_back(input_file.c_str());
146
+
147
+ args.push_back("-I");
148
+ args.push_back(include_dir);
149
+
150
+ args.push_back(debug ? "-O0" : "-O2");
151
+
152
+ args.push_back("-triple");
153
+ args.push_back("nvptx64-nvidia-cuda");
154
+
155
+ args.push_back("-target-cpu");
156
+ args.push_back("sm_70");
157
+
158
+ clang::IntrusiveRefCntPtr<clang::DiagnosticOptions> diagnostic_options = new clang::DiagnosticOptions();
159
+ std::unique_ptr<clang::TextDiagnosticPrinter> text_diagnostic_printer =
160
+ std::make_unique<clang::TextDiagnosticPrinter>(llvm::errs(), &*diagnostic_options);
161
+ clang::IntrusiveRefCntPtr<clang::DiagnosticIDs> diagnostic_ids;
162
+ std::unique_ptr<clang::DiagnosticsEngine> diagnostic_engine =
163
+ std::make_unique<clang::DiagnosticsEngine>(diagnostic_ids, &*diagnostic_options, text_diagnostic_printer.release());
164
+
165
+ clang::CompilerInstance compiler_instance;
166
+
167
+ auto& compiler_invocation = compiler_instance.getInvocation();
168
+ clang::CompilerInvocation::CreateFromArgs(compiler_invocation, args, *diagnostic_engine.release());
169
+
170
+ if(debug)
171
+ {
172
+ compiler_invocation.getCodeGenOpts().setDebugInfo(clang::codegenoptions::FullDebugInfo);
173
+ }
174
+
175
+ // Map code to a MemoryBuffer
176
+ std::unique_ptr<llvm::MemoryBuffer> buffer = llvm::MemoryBuffer::getMemBufferCopy(cpp_src);
177
+ compiler_invocation.getPreprocessorOpts().addRemappedFile(input_file.c_str(), buffer.get());
178
+
179
+ // According to https://llvm.org/docs/CompileCudaWithLLVM.html, "Both clang and nvcc define `__CUDACC__` during CUDA compilation."
180
+ // But this normally happens in the __clang_cuda_runtime_wrapper.h header, which we don't include.
181
+ // The __CUDA__ and __CUDA_ARCH__ macros are internally defined by llvm-project/clang/lib/Frontend/InitPreprocessor.cpp
182
+ compiler_instance.getPreprocessorOpts().addMacroDef("__CUDACC__");
183
+
184
+ if(!debug)
185
+ {
186
+ compiler_instance.getPreprocessorOpts().addMacroDef("NDEBUG");
187
+ }
188
+
189
+ compiler_instance.getLangOpts().CUDA = 1;
190
+ compiler_instance.getLangOpts().CUDAIsDevice = 1;
191
+ compiler_instance.getLangOpts().CUDAAllowVariadicFunctions = 1;
192
+
193
+ compiler_instance.createDiagnostics(text_diagnostic_printer.get(), false);
145
194
 
146
- std::string input_file = std::string(output_file).substr(0, std::strlen(output_file) - std::strlen(obj_ext));
195
+ clang::EmitLLVMOnlyAction emit_llvm_only_action(&context);
196
+ bool success = compiler_instance.ExecuteAction(emit_llvm_only_action);
197
+ buffer.release();
198
+
199
+ return success ? std::move(emit_llvm_only_action.takeModule()) : nullptr;
200
+ }
147
201
 
202
+ extern "C" {
203
+
204
+ WP_API int compile_cpp(const char* cpp_src, const char *input_file, const char* include_dir, const char* output_file, bool debug)
205
+ {
148
206
  initialize_llvm();
149
207
 
150
208
  llvm::LLVMContext context;
@@ -155,13 +213,13 @@ WP_API int compile_cpp(const char* cpp_src, const char* include_dir, const char*
155
213
  return -1;
156
214
  }
157
215
 
158
- std::string Error;
159
- const llvm::Target* target = llvm::TargetRegistry::lookupTarget(target_triple, Error);
216
+ std::string error;
217
+ const llvm::Target* target = llvm::TargetRegistry::lookupTarget(target_triple, error);
160
218
 
161
219
  const char* CPU = "generic";
162
220
  const char* features = "";
163
221
  llvm::TargetOptions target_options;
164
- llvm::Reloc::Model relocation_model = llvm::Reloc::PIC_; // DLLs need Position Independent Code
222
+ llvm::Reloc::Model relocation_model = llvm::Reloc::PIC_; // Position Independent Code
165
223
  llvm::CodeModel::Model code_model = llvm::CodeModel::Large; // Don't make assumptions about displacement sizes
166
224
  llvm::TargetMachine* target_machine = target->createTargetMachine(target_triple, CPU, features, target_options, relocation_model, code_model);
167
225
 
@@ -182,6 +240,59 @@ WP_API int compile_cpp(const char* cpp_src, const char* include_dir, const char*
182
240
  return 0;
183
241
  }
184
242
 
243
+ WP_API int compile_cuda(const char* cpp_src, const char *input_file, const char* include_dir, const char* output_file, bool debug)
244
+ {
245
+ initialize_llvm();
246
+
247
+ llvm::LLVMContext context;
248
+ std::unique_ptr<llvm::Module> module = cuda_to_llvm(input_file, cpp_src, include_dir, debug, context);
249
+
250
+ if(!module)
251
+ {
252
+ return -1;
253
+ }
254
+
255
+ std::string error;
256
+ const llvm::Target* target = llvm::TargetRegistry::lookupTarget("nvptx64-nvidia-cuda", error);
257
+
258
+ const char* CPU = "sm_70";
259
+ const char* features = "+ptx75"; // Warp requires CUDA 11.5, which supports PTX ISA 7.5
260
+ llvm::TargetOptions target_options;
261
+ llvm::Reloc::Model relocation_model = llvm::Reloc::PIC_;
262
+ llvm::TargetMachine* target_machine = target->createTargetMachine("nvptx64-nvidia-cuda", CPU, features, target_options, relocation_model);
263
+
264
+ module->setDataLayout(target_machine->createDataLayout());
265
+
266
+ // Link libdevice
267
+ llvm::SMDiagnostic diagnostic;
268
+ std::string libdevice_path = std::string(include_dir) + "/libdevice/libdevice.10.bc";
269
+ std::unique_ptr<llvm::Module> libdevice(llvm::parseIRFile(libdevice_path, diagnostic, context));
270
+ if(!libdevice)
271
+ {
272
+ return -1;
273
+ }
274
+
275
+ llvm::Linker linker(*module.get());
276
+ if(linker.linkInModule(std::move(libdevice), llvm::Linker::Flags::LinkOnlyNeeded) == true)
277
+ {
278
+ return -1;
279
+ }
280
+
281
+ std::error_code error_code;
282
+ llvm::raw_fd_ostream output(output_file, error_code, llvm::sys::fs::OF_None);
283
+
284
+ llvm::legacy::PassManager pass_manager;
285
+ llvm::CodeGenFileType file_type = llvm::CGFT_AssemblyFile;
286
+ target_machine->addPassesToEmitFile(pass_manager, output, nullptr, file_type);
287
+
288
+ pass_manager.run(*module);
289
+ output.flush();
290
+
291
+ delete target_machine;
292
+
293
+ return 0;
294
+ }
295
+
185
296
  // Global JIT instance
186
297
  static llvm::orc::LLJIT* jit = nullptr;
187
298
 
@@ -248,6 +359,7 @@ WP_API int load_obj(const char* object_file, const char* module_name)
248
359
  SYMBOL(log10f), SYMBOL_T(log10, double(*)(double)),
249
360
  SYMBOL(expf), SYMBOL_T(exp, double(*)(double)),
250
361
  SYMBOL(sqrtf), SYMBOL_T(sqrt, double(*)(double)),
362
+ SYMBOL(cbrtf), SYMBOL_T(cbrt, double(*)(double)),
251
363
  SYMBOL(powf), SYMBOL_T(pow, double(*)(double, double)),
252
364
  SYMBOL(floorf), SYMBOL_T(floor, double(*)(double)),
253
365
  SYMBOL(ceilf), SYMBOL_T(ceil, double(*)(double)),
@@ -276,7 +388,7 @@ WP_API int load_obj(const char* object_file, const char* module_name)
276
388
  SYMBOL(__chkstk),
277
389
  #elif defined(__APPLE__)
278
390
  SYMBOL(__bzero),
279
- SYMBOL(__sincos_stret),
391
+ SYMBOL(__sincos_stret), SYMBOL(__sincosf_stret),
280
392
  #else
281
393
  SYMBOL(sincosf), SYMBOL_T(sincos, void(*)(double,double*,double*)),
282
394
  #endif
@@ -335,7 +447,7 @@ WP_API uint64_t lookup(const char* dll_name, const char* function_name)
335
447
  if(!func)
336
448
  {
337
449
  std::cerr << "Failed to lookup symbol: " << llvm::toString(func.takeError()) << std::endl;
338
- return -1;
450
+ return 0;
339
451
  }
340
452
 
341
453
  return func->getValue();
warp/native/crt.cpp CHANGED
@@ -29,79 +29,4 @@ extern "C" WP_API void _wp_assert(const char* expression, const char* file, unsi
29
29
  // Now invoke the standard assert(), which may abort the program or break
30
30
  // into the debugger as decided by the runtime environment.
31
31
  assert(false && "assert() failed");
32
- }
33
-
34
- // Export CRT symbols from warp.dll for use by compute kernel DLLs
35
- // These are declared in crt.h
36
- #if defined(_MSC_VER)
37
-
38
- #pragma comment(linker,"/export:printf")
39
-
40
- #pragma comment(linker,"/export:abs")
41
- #pragma comment(linker,"/export:llabs")
42
-
43
- #pragma comment(linker,"/export:fmodf")
44
- #pragma comment(linker,"/export:fmod")
45
- #pragma comment(linker,"/export:logf")
46
- #pragma comment(linker,"/export:log")
47
- #pragma comment(linker,"/export:log2f")
48
- #pragma comment(linker,"/export:log2")
49
- #pragma comment(linker,"/export:log10f")
50
- #pragma comment(linker,"/export:log10")
51
- #pragma comment(linker,"/export:expf")
52
- #pragma comment(linker,"/export:exp")
53
- #pragma comment(linker,"/export:sqrtf")
54
- #pragma comment(linker,"/export:sqrt")
55
- #pragma comment(linker,"/export:powf")
56
- #pragma comment(linker,"/export:pow")
57
- #pragma comment(linker,"/export:floorf")
58
- #pragma comment(linker,"/export:floor")
59
- #pragma comment(linker,"/export:ceilf")
60
- #pragma comment(linker,"/export:ceil")
61
- #pragma comment(linker,"/export:fabsf")
62
- #pragma comment(linker,"/export:fabs")
63
- #pragma comment(linker,"/export:roundf")
64
- #pragma comment(linker,"/export:round")
65
- #pragma comment(linker,"/export:truncf")
66
- #pragma comment(linker,"/export:trunc")
67
- #pragma comment(linker,"/export:rintf")
68
- #pragma comment(linker,"/export:rint")
69
- #pragma comment(linker,"/export:acosf")
70
- #pragma comment(linker,"/export:acos")
71
- #pragma comment(linker,"/export:asinf")
72
- #pragma comment(linker,"/export:asin")
73
- #pragma comment(linker,"/export:atanf")
74
- #pragma comment(linker,"/export:atan")
75
- #pragma comment(linker,"/export:atan2f")
76
- #pragma comment(linker,"/export:atan2")
77
- #pragma comment(linker,"/export:cosf")
78
- #pragma comment(linker,"/export:cos")
79
- #pragma comment(linker,"/export:sinf")
80
- #pragma comment(linker,"/export:sin")
81
- #pragma comment(linker,"/export:tanf")
82
- #pragma comment(linker,"/export:tan")
83
- #pragma comment(linker,"/export:sinhf")
84
- #pragma comment(linker,"/export:sinh")
85
- #pragma comment(linker,"/export:coshf")
86
- #pragma comment(linker,"/export:cosh")
87
- #pragma comment(linker,"/export:tanhf")
88
- #pragma comment(linker,"/export:tanh")
89
- #pragma comment(linker,"/export:fmaf")
90
-
91
- #pragma comment(linker,"/export:memset")
92
- #pragma comment(linker,"/export:memcpy")
93
-
94
- #pragma comment(linker,"/export:_wp_isfinite")
95
- #pragma comment(linker,"/export:_wp_assert")
96
-
97
- // For functions with large stack frames the MSVC compiler will emit a call to
98
- // __chkstk() to linearly touch each memory page. This grows the stack without
99
- // triggering the stack overflow guards.
100
- #pragma comment(linker,"/export:__chkstk")
101
-
102
- // The MSVC linker checks for the _fltused symbol if any floating-point
103
- // functionality is used. It's defined by the Microsoft CRT to indicate that
104
- // the x87 FPU control word was properly initialized.
105
- #pragma comment(linker,"/export:_fltused")
106
-
107
- #endif // _MSC_VER
32
+ }
warp/native/crt.h CHANGED
@@ -30,15 +30,15 @@
30
30
  #define WP_API
31
31
  #endif
32
32
 
33
- extern "C" {
33
+ #if !defined(__CUDA_ARCH__)
34
34
 
35
35
  // Helper for implementing assert() macro
36
- WP_API void _wp_assert(const char* message, const char* file, unsigned int line);
36
+ extern "C" WP_API void _wp_assert(const char* message, const char* file, unsigned int line);
37
37
 
38
38
  // Helper for implementing isfinite()
39
- WP_API int _wp_isfinite(double);
39
+ extern "C" WP_API int _wp_isfinite(double);
40
40
 
41
- } // extern "C"
41
+ #endif // !__CUDA_ARCH__
42
42
 
43
43
  #if !defined(WP_NO_CRT)
44
44
 
@@ -52,106 +52,6 @@ WP_API int _wp_isfinite(double);
52
52
 
53
53
  #else
54
54
 
55
- #if defined(__CUDACC__)
56
-
57
- // stdio.h
58
- extern "C" __device__ int printf(const char* format, ... );
59
-
60
- #else
61
-
62
- extern "C" {
63
-
64
- // stdio.h
65
- int printf(const char * format, ... );
66
-
67
- // stdlib.h
68
- int abs(int);
69
- long long llabs(long long);
70
-
71
- // math.h
72
- float fmodf(float, float);
73
- double fmod(double, double);
74
- float logf(float);
75
- double log(double);
76
- float log2f(float);
77
- double log2(double);
78
- float log10f(float);
79
- double log10(double);
80
- float expf(float);
81
- double exp(double);
82
- float sqrtf(float);
83
- double sqrt(double);
84
- float powf(float, float);
85
- double pow(double, double);
86
- float floorf(float);
87
- double floor(double);
88
- float ceilf(float);
89
- double ceil(double);
90
- float fabsf(float);
91
- double fabs(double);
92
- float roundf(float);
93
- double round(double);
94
- float truncf(float);
95
- double trunc(double);
96
- float rintf(float);
97
- double rint(double);
98
- float acosf(float);
99
- double acos(double);
100
- float asinf(float);
101
- double asin(double);
102
- float atanf(float);
103
- double atan(double);
104
- float atan2f(float, float);
105
- double atan2(double, double);
106
- float cosf(float);
107
- double cos(double);
108
- float sinf(float);
109
- double sin(double);
110
- float tanf(float);
111
- double tan(double);
112
- float sinhf(float);
113
- double sinh(double);
114
- float coshf(float);
115
- double cosh(double);
116
- float tanhf(float);
117
- double tanh(double);
118
- float fmaf(float, float, float);
119
-
120
- // stddef.h
121
- #if defined(_WIN32)
122
- using size_t = unsigned __int64;
123
- #else
124
- using size_t = unsigned long;
125
- #endif
126
-
127
- // string.h
128
- void* memset(void*, int, size_t);
129
- void* memcpy(void*, const void*, size_t);
130
-
131
- // stdlib.h
132
- void* malloc(size_t);
133
- void free(void*);
134
-
135
- } // extern "C"
136
-
137
- // cmath
138
- inline bool isfinite(double x)
139
- {
140
- return _wp_isfinite(x);
141
- }
142
-
143
- // assert.h
144
- #ifdef NDEBUG
145
- #define assert(expression) ((void)0)
146
- #else
147
- #define assert(expression) (void)( \
148
- (!!(expression)) || \
149
- (_wp_assert((#expression), (__FILE__), (unsigned)(__LINE__)), 0) \
150
- )
151
- #endif
152
-
153
- #endif // !__CUDACC__
154
-
155
55
  // These definitions are taken from Jitify: https://github.com/NVIDIA/jitify
156
56
 
157
57
  /// float.h
@@ -221,6 +121,9 @@ enum {
221
121
  #define LLONG_MIN (-LLONG_MAX - 1LL)
222
122
  #define ULLONG_MAX 18446744073709551615ULL
223
123
 
124
+ #define INFINITY ((float)(DBL_MAX * DBL_MAX))
125
+ #define HUGE_VAL ((double)INFINITY)
126
+ #define HUGE_VALF ((float)INFINITY)
224
127
 
225
128
  /// stdint.h
226
129
  typedef signed char int8_t;
@@ -325,4 +228,108 @@ typedef unsigned long long uint64_t;
325
228
 
326
229
  #define M_PI 3.14159265358979323846
327
230
 
231
+ #if defined(__CUDACC__)
232
+
233
+ #if defined(__clang__)
234
+ // When compiling CUDA with barebones Clang we need to define its builtins and runtime functions ourselves.
235
+ #include "cuda_crt.h"
236
+ #endif
237
+
238
+ #else
239
+
240
+ extern "C" {
241
+
242
+ // stdio.h
243
+ int printf(const char * format, ... );
244
+
245
+ // stdlib.h
246
+ int abs(int);
247
+ long long llabs(long long);
248
+
249
+ // math.h
250
+ float fmodf(float, float);
251
+ double fmod(double, double);
252
+ float logf(float);
253
+ double log(double);
254
+ float log2f(float);
255
+ double log2(double);
256
+ float log10f(float);
257
+ double log10(double);
258
+ float expf(float);
259
+ double exp(double);
260
+ float sqrtf(float);
261
+ double sqrt(double);
262
+ float cbrtf(float);
263
+ double cbrt(double);
264
+ float powf(float, float);
265
+ double pow(double, double);
266
+ float floorf(float);
267
+ double floor(double);
268
+ float ceilf(float);
269
+ double ceil(double);
270
+ float fabsf(float);
271
+ double fabs(double);
272
+ float roundf(float);
273
+ double round(double);
274
+ float truncf(float);
275
+ double trunc(double);
276
+ float rintf(float);
277
+ double rint(double);
278
+ float acosf(float);
279
+ double acos(double);
280
+ float asinf(float);
281
+ double asin(double);
282
+ float atanf(float);
283
+ double atan(double);
284
+ float atan2f(float, float);
285
+ double atan2(double, double);
286
+ float cosf(float);
287
+ double cos(double);
288
+ float sinf(float);
289
+ double sin(double);
290
+ float tanf(float);
291
+ double tan(double);
292
+ float sinhf(float);
293
+ double sinh(double);
294
+ float coshf(float);
295
+ double cosh(double);
296
+ float tanhf(float);
297
+ double tanh(double);
298
+ float fmaf(float, float, float);
299
+
300
+ // stddef.h
301
+ #if defined(_WIN32)
302
+ using size_t = unsigned __int64;
303
+ #else
304
+ using size_t = unsigned long;
305
+ #endif
306
+
307
+ // string.h
308
+ void* memset(void*, int, size_t);
309
+ void* memcpy(void*, const void*, size_t);
310
+
311
+ // stdlib.h
312
+ void* malloc(size_t);
313
+ void free(void*);
314
+
315
+ } // extern "C"
316
+
317
+ // cmath
318
+ inline bool isfinite(double x)
319
+ {
320
+ return _wp_isfinite(x);
321
+ }
322
+
323
+ // assert.h
324
+ #ifdef NDEBUG
325
+ #define assert(expression) ((void)0)
326
+ #else
327
+ #define assert(expression) (void)( \
328
+ (!!(expression)) || \
329
+ (_wp_assert((#expression), (__FILE__), (unsigned)(__LINE__)), 0) \
330
+ )
331
+ #endif
332
+
333
+ #endif // !__CUDACC__
334
+
328
335
  #endif // WP_NO_CRT