warp-lang 1.7.0 (py3-none-manylinux_2_34_aarch64.whl)

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of warp-lang has been flagged and might be problematic.

Files changed (429)
  1. warp/__init__.py +139 -0
  2. warp/__init__.pyi +1 -0
  3. warp/autograd.py +1142 -0
  4. warp/bin/warp-clang.so +0 -0
  5. warp/bin/warp.so +0 -0
  6. warp/build.py +557 -0
  7. warp/build_dll.py +405 -0
  8. warp/builtins.py +6855 -0
  9. warp/codegen.py +3969 -0
  10. warp/config.py +158 -0
  11. warp/constants.py +57 -0
  12. warp/context.py +6812 -0
  13. warp/dlpack.py +462 -0
  14. warp/examples/__init__.py +24 -0
  15. warp/examples/assets/bear.usd +0 -0
  16. warp/examples/assets/bunny.usd +0 -0
  17. warp/examples/assets/cartpole.urdf +110 -0
  18. warp/examples/assets/crazyflie.usd +0 -0
  19. warp/examples/assets/cube.usd +0 -0
  20. warp/examples/assets/nonuniform.usd +0 -0
  21. warp/examples/assets/nv_ant.xml +92 -0
  22. warp/examples/assets/nv_humanoid.xml +183 -0
  23. warp/examples/assets/nvidia_logo.png +0 -0
  24. warp/examples/assets/pixel.jpg +0 -0
  25. warp/examples/assets/quadruped.urdf +268 -0
  26. warp/examples/assets/rocks.nvdb +0 -0
  27. warp/examples/assets/rocks.usd +0 -0
  28. warp/examples/assets/sphere.usd +0 -0
  29. warp/examples/assets/square_cloth.usd +0 -0
  30. warp/examples/benchmarks/benchmark_api.py +389 -0
  31. warp/examples/benchmarks/benchmark_cloth.py +296 -0
  32. warp/examples/benchmarks/benchmark_cloth_cupy.py +96 -0
  33. warp/examples/benchmarks/benchmark_cloth_jax.py +105 -0
  34. warp/examples/benchmarks/benchmark_cloth_numba.py +161 -0
  35. warp/examples/benchmarks/benchmark_cloth_numpy.py +85 -0
  36. warp/examples/benchmarks/benchmark_cloth_paddle.py +94 -0
  37. warp/examples/benchmarks/benchmark_cloth_pytorch.py +94 -0
  38. warp/examples/benchmarks/benchmark_cloth_taichi.py +120 -0
  39. warp/examples/benchmarks/benchmark_cloth_warp.py +153 -0
  40. warp/examples/benchmarks/benchmark_gemm.py +164 -0
  41. warp/examples/benchmarks/benchmark_interop_paddle.py +166 -0
  42. warp/examples/benchmarks/benchmark_interop_torch.py +166 -0
  43. warp/examples/benchmarks/benchmark_launches.py +301 -0
  44. warp/examples/benchmarks/benchmark_tile_load_store.py +103 -0
  45. warp/examples/browse.py +37 -0
  46. warp/examples/core/example_cupy.py +86 -0
  47. warp/examples/core/example_dem.py +241 -0
  48. warp/examples/core/example_fluid.py +299 -0
  49. warp/examples/core/example_graph_capture.py +150 -0
  50. warp/examples/core/example_marching_cubes.py +194 -0
  51. warp/examples/core/example_mesh.py +180 -0
  52. warp/examples/core/example_mesh_intersect.py +211 -0
  53. warp/examples/core/example_nvdb.py +182 -0
  54. warp/examples/core/example_raycast.py +111 -0
  55. warp/examples/core/example_raymarch.py +205 -0
  56. warp/examples/core/example_render_opengl.py +193 -0
  57. warp/examples/core/example_sample_mesh.py +300 -0
  58. warp/examples/core/example_sph.py +411 -0
  59. warp/examples/core/example_torch.py +211 -0
  60. warp/examples/core/example_wave.py +269 -0
  61. warp/examples/fem/example_adaptive_grid.py +286 -0
  62. warp/examples/fem/example_apic_fluid.py +423 -0
  63. warp/examples/fem/example_burgers.py +261 -0
  64. warp/examples/fem/example_convection_diffusion.py +178 -0
  65. warp/examples/fem/example_convection_diffusion_dg.py +204 -0
  66. warp/examples/fem/example_deformed_geometry.py +172 -0
  67. warp/examples/fem/example_diffusion.py +196 -0
  68. warp/examples/fem/example_diffusion_3d.py +225 -0
  69. warp/examples/fem/example_diffusion_mgpu.py +220 -0
  70. warp/examples/fem/example_distortion_energy.py +228 -0
  71. warp/examples/fem/example_magnetostatics.py +240 -0
  72. warp/examples/fem/example_mixed_elasticity.py +291 -0
  73. warp/examples/fem/example_navier_stokes.py +261 -0
  74. warp/examples/fem/example_nonconforming_contact.py +298 -0
  75. warp/examples/fem/example_stokes.py +213 -0
  76. warp/examples/fem/example_stokes_transfer.py +262 -0
  77. warp/examples/fem/example_streamlines.py +352 -0
  78. warp/examples/fem/utils.py +1000 -0
  79. warp/examples/interop/example_jax_callable.py +116 -0
  80. warp/examples/interop/example_jax_ffi_callback.py +132 -0
  81. warp/examples/interop/example_jax_kernel.py +205 -0
  82. warp/examples/optim/example_bounce.py +266 -0
  83. warp/examples/optim/example_cloth_throw.py +228 -0
  84. warp/examples/optim/example_diffray.py +561 -0
  85. warp/examples/optim/example_drone.py +870 -0
  86. warp/examples/optim/example_fluid_checkpoint.py +497 -0
  87. warp/examples/optim/example_inverse_kinematics.py +182 -0
  88. warp/examples/optim/example_inverse_kinematics_torch.py +191 -0
  89. warp/examples/optim/example_softbody_properties.py +400 -0
  90. warp/examples/optim/example_spring_cage.py +245 -0
  91. warp/examples/optim/example_trajectory.py +227 -0
  92. warp/examples/sim/example_cartpole.py +143 -0
  93. warp/examples/sim/example_cloth.py +225 -0
  94. warp/examples/sim/example_cloth_self_contact.py +322 -0
  95. warp/examples/sim/example_granular.py +130 -0
  96. warp/examples/sim/example_granular_collision_sdf.py +202 -0
  97. warp/examples/sim/example_jacobian_ik.py +244 -0
  98. warp/examples/sim/example_particle_chain.py +124 -0
  99. warp/examples/sim/example_quadruped.py +203 -0
  100. warp/examples/sim/example_rigid_chain.py +203 -0
  101. warp/examples/sim/example_rigid_contact.py +195 -0
  102. warp/examples/sim/example_rigid_force.py +133 -0
  103. warp/examples/sim/example_rigid_gyroscopic.py +115 -0
  104. warp/examples/sim/example_rigid_soft_contact.py +140 -0
  105. warp/examples/sim/example_soft_body.py +196 -0
  106. warp/examples/tile/example_tile_cholesky.py +87 -0
  107. warp/examples/tile/example_tile_convolution.py +66 -0
  108. warp/examples/tile/example_tile_fft.py +55 -0
  109. warp/examples/tile/example_tile_filtering.py +113 -0
  110. warp/examples/tile/example_tile_matmul.py +85 -0
  111. warp/examples/tile/example_tile_mlp.py +383 -0
  112. warp/examples/tile/example_tile_nbody.py +199 -0
  113. warp/examples/tile/example_tile_walker.py +327 -0
  114. warp/fabric.py +355 -0
  115. warp/fem/__init__.py +106 -0
  116. warp/fem/adaptivity.py +508 -0
  117. warp/fem/cache.py +572 -0
  118. warp/fem/dirichlet.py +202 -0
  119. warp/fem/domain.py +411 -0
  120. warp/fem/field/__init__.py +125 -0
  121. warp/fem/field/field.py +619 -0
  122. warp/fem/field/nodal_field.py +326 -0
  123. warp/fem/field/restriction.py +37 -0
  124. warp/fem/field/virtual.py +848 -0
  125. warp/fem/geometry/__init__.py +32 -0
  126. warp/fem/geometry/adaptive_nanogrid.py +857 -0
  127. warp/fem/geometry/closest_point.py +84 -0
  128. warp/fem/geometry/deformed_geometry.py +221 -0
  129. warp/fem/geometry/element.py +776 -0
  130. warp/fem/geometry/geometry.py +362 -0
  131. warp/fem/geometry/grid_2d.py +392 -0
  132. warp/fem/geometry/grid_3d.py +452 -0
  133. warp/fem/geometry/hexmesh.py +911 -0
  134. warp/fem/geometry/nanogrid.py +571 -0
  135. warp/fem/geometry/partition.py +389 -0
  136. warp/fem/geometry/quadmesh.py +663 -0
  137. warp/fem/geometry/tetmesh.py +855 -0
  138. warp/fem/geometry/trimesh.py +806 -0
  139. warp/fem/integrate.py +2335 -0
  140. warp/fem/linalg.py +419 -0
  141. warp/fem/operator.py +293 -0
  142. warp/fem/polynomial.py +229 -0
  143. warp/fem/quadrature/__init__.py +17 -0
  144. warp/fem/quadrature/pic_quadrature.py +299 -0
  145. warp/fem/quadrature/quadrature.py +591 -0
  146. warp/fem/space/__init__.py +228 -0
  147. warp/fem/space/basis_function_space.py +468 -0
  148. warp/fem/space/basis_space.py +667 -0
  149. warp/fem/space/dof_mapper.py +251 -0
  150. warp/fem/space/function_space.py +309 -0
  151. warp/fem/space/grid_2d_function_space.py +177 -0
  152. warp/fem/space/grid_3d_function_space.py +227 -0
  153. warp/fem/space/hexmesh_function_space.py +257 -0
  154. warp/fem/space/nanogrid_function_space.py +201 -0
  155. warp/fem/space/partition.py +367 -0
  156. warp/fem/space/quadmesh_function_space.py +223 -0
  157. warp/fem/space/restriction.py +179 -0
  158. warp/fem/space/shape/__init__.py +143 -0
  159. warp/fem/space/shape/cube_shape_function.py +1105 -0
  160. warp/fem/space/shape/shape_function.py +133 -0
  161. warp/fem/space/shape/square_shape_function.py +926 -0
  162. warp/fem/space/shape/tet_shape_function.py +834 -0
  163. warp/fem/space/shape/triangle_shape_function.py +672 -0
  164. warp/fem/space/tetmesh_function_space.py +271 -0
  165. warp/fem/space/topology.py +424 -0
  166. warp/fem/space/trimesh_function_space.py +194 -0
  167. warp/fem/types.py +99 -0
  168. warp/fem/utils.py +420 -0
  169. warp/jax.py +187 -0
  170. warp/jax_experimental/__init__.py +16 -0
  171. warp/jax_experimental/custom_call.py +351 -0
  172. warp/jax_experimental/ffi.py +698 -0
  173. warp/jax_experimental/xla_ffi.py +602 -0
  174. warp/math.py +244 -0
  175. warp/native/array.h +1145 -0
  176. warp/native/builtin.h +1800 -0
  177. warp/native/bvh.cpp +492 -0
  178. warp/native/bvh.cu +791 -0
  179. warp/native/bvh.h +554 -0
  180. warp/native/clang/clang.cpp +536 -0
  181. warp/native/coloring.cpp +613 -0
  182. warp/native/crt.cpp +51 -0
  183. warp/native/crt.h +362 -0
  184. warp/native/cuda_crt.h +1058 -0
  185. warp/native/cuda_util.cpp +646 -0
  186. warp/native/cuda_util.h +307 -0
  187. warp/native/error.cpp +77 -0
  188. warp/native/error.h +36 -0
  189. warp/native/exports.h +1878 -0
  190. warp/native/fabric.h +245 -0
  191. warp/native/hashgrid.cpp +311 -0
  192. warp/native/hashgrid.cu +87 -0
  193. warp/native/hashgrid.h +240 -0
  194. warp/native/initializer_array.h +41 -0
  195. warp/native/intersect.h +1230 -0
  196. warp/native/intersect_adj.h +375 -0
  197. warp/native/intersect_tri.h +339 -0
  198. warp/native/marching.cpp +19 -0
  199. warp/native/marching.cu +514 -0
  200. warp/native/marching.h +19 -0
  201. warp/native/mat.h +2220 -0
  202. warp/native/mathdx.cpp +87 -0
  203. warp/native/matnn.h +343 -0
  204. warp/native/mesh.cpp +266 -0
  205. warp/native/mesh.cu +404 -0
  206. warp/native/mesh.h +1980 -0
  207. warp/native/nanovdb/GridHandle.h +366 -0
  208. warp/native/nanovdb/HostBuffer.h +590 -0
  209. warp/native/nanovdb/NanoVDB.h +6624 -0
  210. warp/native/nanovdb/PNanoVDB.h +3390 -0
  211. warp/native/noise.h +859 -0
  212. warp/native/quat.h +1371 -0
  213. warp/native/rand.h +342 -0
  214. warp/native/range.h +139 -0
  215. warp/native/reduce.cpp +174 -0
  216. warp/native/reduce.cu +364 -0
  217. warp/native/runlength_encode.cpp +79 -0
  218. warp/native/runlength_encode.cu +61 -0
  219. warp/native/scan.cpp +47 -0
  220. warp/native/scan.cu +53 -0
  221. warp/native/scan.h +23 -0
  222. warp/native/solid_angle.h +466 -0
  223. warp/native/sort.cpp +251 -0
  224. warp/native/sort.cu +277 -0
  225. warp/native/sort.h +33 -0
  226. warp/native/sparse.cpp +378 -0
  227. warp/native/sparse.cu +524 -0
  228. warp/native/spatial.h +657 -0
  229. warp/native/svd.h +702 -0
  230. warp/native/temp_buffer.h +46 -0
  231. warp/native/tile.h +2584 -0
  232. warp/native/tile_reduce.h +264 -0
  233. warp/native/vec.h +1426 -0
  234. warp/native/volume.cpp +501 -0
  235. warp/native/volume.cu +67 -0
  236. warp/native/volume.h +969 -0
  237. warp/native/volume_builder.cu +477 -0
  238. warp/native/volume_builder.h +52 -0
  239. warp/native/volume_impl.h +70 -0
  240. warp/native/warp.cpp +1082 -0
  241. warp/native/warp.cu +3636 -0
  242. warp/native/warp.h +381 -0
  243. warp/optim/__init__.py +17 -0
  244. warp/optim/adam.py +163 -0
  245. warp/optim/linear.py +1137 -0
  246. warp/optim/sgd.py +112 -0
  247. warp/paddle.py +407 -0
  248. warp/render/__init__.py +18 -0
  249. warp/render/render_opengl.py +3518 -0
  250. warp/render/render_usd.py +784 -0
  251. warp/render/utils.py +160 -0
  252. warp/sim/__init__.py +65 -0
  253. warp/sim/articulation.py +793 -0
  254. warp/sim/collide.py +2395 -0
  255. warp/sim/graph_coloring.py +300 -0
  256. warp/sim/import_mjcf.py +790 -0
  257. warp/sim/import_snu.py +227 -0
  258. warp/sim/import_urdf.py +579 -0
  259. warp/sim/import_usd.py +894 -0
  260. warp/sim/inertia.py +324 -0
  261. warp/sim/integrator.py +242 -0
  262. warp/sim/integrator_euler.py +1997 -0
  263. warp/sim/integrator_featherstone.py +2101 -0
  264. warp/sim/integrator_vbd.py +2048 -0
  265. warp/sim/integrator_xpbd.py +3292 -0
  266. warp/sim/model.py +4791 -0
  267. warp/sim/particles.py +121 -0
  268. warp/sim/render.py +427 -0
  269. warp/sim/utils.py +428 -0
  270. warp/sparse.py +2057 -0
  271. warp/stubs.py +3333 -0
  272. warp/tape.py +1203 -0
  273. warp/tests/__init__.py +1 -0
  274. warp/tests/__main__.py +4 -0
  275. warp/tests/assets/curlnoise_golden.npy +0 -0
  276. warp/tests/assets/mlp_golden.npy +0 -0
  277. warp/tests/assets/pixel.npy +0 -0
  278. warp/tests/assets/pnoise_golden.npy +0 -0
  279. warp/tests/assets/spiky.usd +0 -0
  280. warp/tests/assets/test_grid.nvdb +0 -0
  281. warp/tests/assets/test_index_grid.nvdb +0 -0
  282. warp/tests/assets/test_int32_grid.nvdb +0 -0
  283. warp/tests/assets/test_vec_grid.nvdb +0 -0
  284. warp/tests/assets/torus.nvdb +0 -0
  285. warp/tests/assets/torus.usda +105 -0
  286. warp/tests/aux_test_class_kernel.py +34 -0
  287. warp/tests/aux_test_compile_consts_dummy.py +18 -0
  288. warp/tests/aux_test_conditional_unequal_types_kernels.py +29 -0
  289. warp/tests/aux_test_dependent.py +29 -0
  290. warp/tests/aux_test_grad_customs.py +29 -0
  291. warp/tests/aux_test_instancing_gc.py +26 -0
  292. warp/tests/aux_test_module_unload.py +23 -0
  293. warp/tests/aux_test_name_clash1.py +40 -0
  294. warp/tests/aux_test_name_clash2.py +40 -0
  295. warp/tests/aux_test_reference.py +9 -0
  296. warp/tests/aux_test_reference_reference.py +8 -0
  297. warp/tests/aux_test_square.py +16 -0
  298. warp/tests/aux_test_unresolved_func.py +22 -0
  299. warp/tests/aux_test_unresolved_symbol.py +22 -0
  300. warp/tests/cuda/__init__.py +0 -0
  301. warp/tests/cuda/test_async.py +676 -0
  302. warp/tests/cuda/test_ipc.py +124 -0
  303. warp/tests/cuda/test_mempool.py +233 -0
  304. warp/tests/cuda/test_multigpu.py +169 -0
  305. warp/tests/cuda/test_peer.py +139 -0
  306. warp/tests/cuda/test_pinned.py +84 -0
  307. warp/tests/cuda/test_streams.py +634 -0
  308. warp/tests/geometry/__init__.py +0 -0
  309. warp/tests/geometry/test_bvh.py +200 -0
  310. warp/tests/geometry/test_hash_grid.py +221 -0
  311. warp/tests/geometry/test_marching_cubes.py +74 -0
  312. warp/tests/geometry/test_mesh.py +316 -0
  313. warp/tests/geometry/test_mesh_query_aabb.py +399 -0
  314. warp/tests/geometry/test_mesh_query_point.py +932 -0
  315. warp/tests/geometry/test_mesh_query_ray.py +311 -0
  316. warp/tests/geometry/test_volume.py +1103 -0
  317. warp/tests/geometry/test_volume_write.py +346 -0
  318. warp/tests/interop/__init__.py +0 -0
  319. warp/tests/interop/test_dlpack.py +729 -0
  320. warp/tests/interop/test_jax.py +371 -0
  321. warp/tests/interop/test_paddle.py +800 -0
  322. warp/tests/interop/test_torch.py +1001 -0
  323. warp/tests/run_coverage_serial.py +39 -0
  324. warp/tests/sim/__init__.py +0 -0
  325. warp/tests/sim/disabled_kinematics.py +244 -0
  326. warp/tests/sim/flaky_test_sim_grad.py +290 -0
  327. warp/tests/sim/test_collision.py +604 -0
  328. warp/tests/sim/test_coloring.py +258 -0
  329. warp/tests/sim/test_model.py +224 -0
  330. warp/tests/sim/test_sim_grad_bounce_linear.py +212 -0
  331. warp/tests/sim/test_sim_kinematics.py +98 -0
  332. warp/tests/sim/test_vbd.py +597 -0
  333. warp/tests/test_adam.py +163 -0
  334. warp/tests/test_arithmetic.py +1096 -0
  335. warp/tests/test_array.py +2972 -0
  336. warp/tests/test_array_reduce.py +156 -0
  337. warp/tests/test_assert.py +250 -0
  338. warp/tests/test_atomic.py +153 -0
  339. warp/tests/test_bool.py +220 -0
  340. warp/tests/test_builtins_resolution.py +1298 -0
  341. warp/tests/test_closest_point_edge_edge.py +327 -0
  342. warp/tests/test_codegen.py +810 -0
  343. warp/tests/test_codegen_instancing.py +1495 -0
  344. warp/tests/test_compile_consts.py +215 -0
  345. warp/tests/test_conditional.py +252 -0
  346. warp/tests/test_context.py +42 -0
  347. warp/tests/test_copy.py +238 -0
  348. warp/tests/test_ctypes.py +638 -0
  349. warp/tests/test_dense.py +73 -0
  350. warp/tests/test_devices.py +97 -0
  351. warp/tests/test_examples.py +482 -0
  352. warp/tests/test_fabricarray.py +996 -0
  353. warp/tests/test_fast_math.py +74 -0
  354. warp/tests/test_fem.py +2003 -0
  355. warp/tests/test_fp16.py +136 -0
  356. warp/tests/test_func.py +454 -0
  357. warp/tests/test_future_annotations.py +98 -0
  358. warp/tests/test_generics.py +656 -0
  359. warp/tests/test_grad.py +893 -0
  360. warp/tests/test_grad_customs.py +339 -0
  361. warp/tests/test_grad_debug.py +341 -0
  362. warp/tests/test_implicit_init.py +411 -0
  363. warp/tests/test_import.py +45 -0
  364. warp/tests/test_indexedarray.py +1140 -0
  365. warp/tests/test_intersect.py +73 -0
  366. warp/tests/test_iter.py +76 -0
  367. warp/tests/test_large.py +177 -0
  368. warp/tests/test_launch.py +411 -0
  369. warp/tests/test_lerp.py +151 -0
  370. warp/tests/test_linear_solvers.py +193 -0
  371. warp/tests/test_lvalue.py +427 -0
  372. warp/tests/test_mat.py +2089 -0
  373. warp/tests/test_mat_lite.py +122 -0
  374. warp/tests/test_mat_scalar_ops.py +2913 -0
  375. warp/tests/test_math.py +178 -0
  376. warp/tests/test_mlp.py +282 -0
  377. warp/tests/test_module_hashing.py +258 -0
  378. warp/tests/test_modules_lite.py +44 -0
  379. warp/tests/test_noise.py +252 -0
  380. warp/tests/test_operators.py +299 -0
  381. warp/tests/test_options.py +129 -0
  382. warp/tests/test_overwrite.py +551 -0
  383. warp/tests/test_print.py +339 -0
  384. warp/tests/test_quat.py +2315 -0
  385. warp/tests/test_rand.py +339 -0
  386. warp/tests/test_reload.py +302 -0
  387. warp/tests/test_rounding.py +185 -0
  388. warp/tests/test_runlength_encode.py +196 -0
  389. warp/tests/test_scalar_ops.py +105 -0
  390. warp/tests/test_smoothstep.py +108 -0
  391. warp/tests/test_snippet.py +318 -0
  392. warp/tests/test_sparse.py +582 -0
  393. warp/tests/test_spatial.py +2229 -0
  394. warp/tests/test_special_values.py +361 -0
  395. warp/tests/test_static.py +592 -0
  396. warp/tests/test_struct.py +734 -0
  397. warp/tests/test_tape.py +204 -0
  398. warp/tests/test_transient_module.py +93 -0
  399. warp/tests/test_triangle_closest_point.py +145 -0
  400. warp/tests/test_types.py +562 -0
  401. warp/tests/test_utils.py +588 -0
  402. warp/tests/test_vec.py +1487 -0
  403. warp/tests/test_vec_lite.py +80 -0
  404. warp/tests/test_vec_scalar_ops.py +2327 -0
  405. warp/tests/test_verify_fp.py +100 -0
  406. warp/tests/tile/__init__.py +0 -0
  407. warp/tests/tile/test_tile.py +780 -0
  408. warp/tests/tile/test_tile_load.py +407 -0
  409. warp/tests/tile/test_tile_mathdx.py +208 -0
  410. warp/tests/tile/test_tile_mlp.py +402 -0
  411. warp/tests/tile/test_tile_reduce.py +447 -0
  412. warp/tests/tile/test_tile_shared_memory.py +247 -0
  413. warp/tests/tile/test_tile_view.py +173 -0
  414. warp/tests/unittest_serial.py +47 -0
  415. warp/tests/unittest_suites.py +427 -0
  416. warp/tests/unittest_utils.py +468 -0
  417. warp/tests/walkthrough_debug.py +93 -0
  418. warp/thirdparty/__init__.py +0 -0
  419. warp/thirdparty/appdirs.py +598 -0
  420. warp/thirdparty/dlpack.py +145 -0
  421. warp/thirdparty/unittest_parallel.py +570 -0
  422. warp/torch.py +391 -0
  423. warp/types.py +5230 -0
  424. warp/utils.py +1137 -0
  425. warp_lang-1.7.0.dist-info/METADATA +516 -0
  426. warp_lang-1.7.0.dist-info/RECORD +429 -0
  427. warp_lang-1.7.0.dist-info/WHEEL +5 -0
  428. warp_lang-1.7.0.dist-info/licenses/LICENSE.md +202 -0
  429. warp_lang-1.7.0.dist-info/top_level.txt +1 -0
warp/native/bvh.cu ADDED
@@ -0,0 +1,791 @@
+ /*
+  * SPDX-FileCopyrightText: Copyright (c) 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+  * SPDX-License-Identifier: Apache-2.0
+  *
+  * Licensed under the Apache License, Version 2.0 (the "License");
+  * you may not use this file except in compliance with the License.
+  * You may obtain a copy of the License at
+  *
+  *     http://www.apache.org/licenses/LICENSE-2.0
+  *
+  * Unless required by applicable law or agreed to in writing, software
+  * distributed under the License is distributed on an "AS IS" BASIS,
+  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+  * See the License for the specific language governing permissions and
+  * limitations under the License.
+  */
+
+ #include "warp.h"
+ #include "cuda_util.h"
+ #include "bvh.h"
+ #include "sort.h"
+
+ #include <vector>
+ #include <algorithm>
+
+ #include <cuda.h>
+ #include <cuda_runtime_api.h>
+
+ #define THRUST_IGNORE_CUB_VERSION_CHECK
+ #define REORDER_HOST_TREE
+
+ #include <cub/cub.cuh>
+
+
+ namespace wp
+ {
+ void bvh_create_host(vec3* lowers, vec3* uppers, int num_items, int constructor_type, BVH& bvh);
+ void bvh_destroy_host(BVH& bvh);
+
+ // for LBVH: the tree will contain some muted leaf nodes, but that is okay; we can still trace
+ // upwards because their parent information remains valid. The only subtlety is that when the parent
+ // of a leaf node is itself a leaf node, we must recompute the parent's bounds, since its child
+ // information was lost in the muting process. A compact tree, such as one built by the SAH or
+ // Median constructor, has no muted leaf nodes.
+ __global__ void bvh_refit_kernel(int n, const int* __restrict__ parents, int* __restrict__ child_count, const int* __restrict__ primitive_indices, BVHPackedNodeHalf* __restrict__ node_lowers, BVHPackedNodeHalf* __restrict__ node_uppers, const vec3* __restrict__ item_lowers, const vec3* __restrict__ item_uppers)
+ {
+     int index = blockDim.x*blockIdx.x + threadIdx.x;
+
+     if (index < n)
+     {
+         bool leaf = node_lowers[index].b;
+         int parent = parents[index];
+
+         if (leaf)
+         {
+             BVHPackedNodeHalf& lower = node_lowers[index];
+             BVHPackedNodeHalf& upper = node_uppers[index];
+             // update the leaf node
+
+             // only need to compute bounds when this is a valid leaf node
+             if (!node_lowers[parent].b)
+             {
+                 const int start = lower.i;
+                 const int end = upper.i;
+
+                 bounds3 bound;
+                 for (int primitive_counter = start; primitive_counter < end; primitive_counter++)
+                 {
+                     const int primitive = primitive_indices[primitive_counter];
+                     bound.add_bounds(item_lowers[primitive], item_uppers[primitive]);
+                 }
+                 (vec3&)lower = bound.lower;
+                 (vec3&)upper = bound.upper;
+             }
+         }
+         else
+         {
+             // only keep leaf threads
+             return;
+         }
+
+         // update hierarchy
+         for (;;)
+         {
+             parent = parents[index];
+             // reached root
+             if (parent == -1)
+                 return;
+
+             // ensure all writes are visible
+             __threadfence();
+
+             int finished = atomicAdd(&child_count[parent], 1);
+
+             // if we are the last thread to arrive (such that the parent node is now complete)
+             // then update its bounds and move on to the next parent in the hierarchy
+             if (finished == 1)
+             {
+                 BVHPackedNodeHalf& parent_lower = node_lowers[parent];
+                 BVHPackedNodeHalf& parent_upper = node_uppers[parent];
+                 if (parent_lower.b)
+                 // a packed leaf node can still be a parent in LBVH; we need to recompute its bounds
+                 // since its left and right child indices were lost in the muting process
+                 {
+                     // update the leaf node
+                     int parent_parent = parents[parent];
+
+                     // only need to compute bounds when this is a valid leaf node
+                     if (!node_lowers[parent_parent].b)
+                     {
+                         const int start = parent_lower.i;
+                         const int end = parent_upper.i;
+                         bounds3 bound;
+                         for (int primitive_counter = start; primitive_counter < end; primitive_counter++)
+                         {
+                             const int primitive = primitive_indices[primitive_counter];
+                             bound.add_bounds(item_lowers[primitive], item_uppers[primitive]);
+                         }
+
+                         (vec3&)parent_lower = bound.lower;
+                         (vec3&)parent_upper = bound.upper;
+                     }
+                 }
+                 else
+                 {
+                     const int left_child = parent_lower.i;
+                     const int right_child = parent_upper.i;
+
+                     vec3 left_lower = (vec3&)(node_lowers[left_child]);
+                     vec3 left_upper = (vec3&)(node_uppers[left_child]);
+                     vec3 right_lower = (vec3&)(node_lowers[right_child]);
+                     vec3 right_upper = (vec3&)(node_uppers[right_child]);
+
+                     // union of child bounds
+                     vec3 lower = min(left_lower, right_lower);
+                     vec3 upper = max(left_upper, right_upper);
+
+                     // write new BVH nodes
+                     (vec3&)parent_lower = lower;
+                     (vec3&)parent_upper = upper;
+                 }
+                 // move on to processing the parent
+                 index = parent;
+             }
+             else
+             {
+                 // parent not ready (we are the first child), terminate thread
+                 break;
+             }
+         }
+     }
+ }
+
+
+ void bvh_refit_device(BVH& bvh)
+ {
+     ContextGuard guard(bvh.context);
+
+     // clear child counters
+     memset_device(WP_CURRENT_CONTEXT, bvh.node_counts, 0, sizeof(int) * bvh.max_nodes);
+     wp_launch_device(WP_CURRENT_CONTEXT, bvh_refit_kernel, bvh.num_leaf_nodes, (bvh.num_leaf_nodes, bvh.node_parents, bvh.node_counts, bvh.primitive_indices, bvh.node_lowers, bvh.node_uppers, bvh.item_lowers, bvh.item_uppers));
+ }
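The refit pass relies on a common CUDA idiom: one thread per leaf walks toward the root, and an atomicAdd on a per-node counter guarantees that exactly one thread, the second child to arrive, processes each internal node after both of its children are final. A minimal self-contained sketch of the same pattern, reducing a value up a binary tree, might look like the following (the tree layout and all names here are illustrative, not Warp's):

    // parents[i] : parent index of node i, -1 at the root
    // left/right : child indices of internal nodes
    // value[i]   : per-node payload, pre-filled at the leaves
    // counter[i] : zero-initialized scratch, one int per internal node
    __global__ void reduce_up_tree(int num_leaves, int first_leaf,
                                   const int* parents, const int* left, const int* right,
                                   float* value, int* counter)
    {
        int node = blockIdx.x * blockDim.x + threadIdx.x;
        if (node >= num_leaves)
            return;

        node += first_leaf;                     // start at a leaf

        for (;;)
        {
            int parent = parents[node];
            if (parent == -1)
                return;                         // reached the root

            __threadfence();                    // publish our writes first

            if (atomicAdd(&counter[parent], 1) == 0)
                return;                         // first child to arrive: stop here

            // second child to arrive: both children are complete, so combine them
            value[parent] = value[left[parent]] + value[right[parent]];
            node = parent;                      // continue toward the root
        }
    }

The __threadfence() before the atomicAdd is what makes the hand-off safe: the arriving thread's bound (or value) writes are globally visible before the surviving thread can observe the incremented counter.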
+
+
+ /////////////////////////////////////////////////////////////////////////////////////////////
+
+ // Create a linear BVH as described in "Fast and Simple Agglomerative LBVH Construction".
+ // This is a bottom-up clustering method that outputs one node per leaf.
+ //
+ class LinearBVHBuilderGPU
+ {
+ public:
+
+     LinearBVHBuilderGPU();
+     ~LinearBVHBuilderGPU();
+
+     // takes a bvh (host ref) and pointers to the GPU lower and upper bounds for each item
+     void build(BVH& bvh, const vec3* item_lowers, const vec3* item_uppers, int num_items, bounds3* total_bounds);
+
+ private:
+
+     // temporary data used during building
+     int* indices;
+     int* keys;
+     int* deltas;
+     int* range_lefts;
+     int* range_rights;
+     int* num_children;
+
+     // bounds data used when the total item bounds are built on the GPU
+     vec3* total_lower;
+     vec3* total_upper;
+     vec3* total_inv_edges;
+ };
+
+ ////////////////////////////////////////////////////////
+
+
+
+ __global__ void compute_morton_codes(const vec3* __restrict__ item_lowers, const vec3* __restrict__ item_uppers, int n, const vec3* grid_lower, const vec3* grid_inv_edges, int* __restrict__ indices, int* __restrict__ keys)
+ {
+     const int index = blockDim.x*blockIdx.x + threadIdx.x;
+
+     if (index < n)
+     {
+         vec3 lower = item_lowers[index];
+         vec3 upper = item_uppers[index];
+
+         vec3 center = 0.5f*(lower+upper);
+
+         vec3 local = cw_mul((center-grid_lower[0]), grid_inv_edges[0]);
+
+         // 10-bit Morton codes per axis, stored in the lower 30 bits (1024^3 effective resolution)
+         int key = morton3<1024>(local[0], local[1], local[2]);
+
+         indices[index] = index;
+         keys[index] = key;
+     }
+ }
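Nearby centroids map to nearby keys because the Morton code interleaves the quantized x, y, z bits. For reference, the classic 30-bit interleave (Karras's formulation) looks like the sketch below; morton3<1024> presumably computes something equivalent, but the helper names here are illustrative, not taken from the Warp sources:

    // spread the lower 10 bits of v so that consecutive bits land 3 apart
    __device__ unsigned int expand_bits_10(unsigned int v)
    {
        v = (v * 0x00010001u) & 0xFF0000FFu;
        v = (v * 0x00000101u) & 0x0F00F00Fu;
        v = (v * 0x00000011u) & 0xC30C30C3u;
        v = (v * 0x00000005u) & 0x49249249u;
        return v;
    }

    __device__ unsigned int morton30(float x, float y, float z)
    {
        // quantize each normalized coordinate to the integer range [0, 1023]
        unsigned int xi = (unsigned int)fminf(fmaxf(x * 1024.0f, 0.0f), 1023.0f);
        unsigned int yi = (unsigned int)fminf(fmaxf(y * 1024.0f, 0.0f), 1023.0f);
        unsigned int zi = (unsigned int)fminf(fmaxf(z * 1024.0f, 0.0f), 1023.0f);

        // interleave: bit i of x/y/z becomes bit 3i+2 / 3i+1 / 3i of the code
        return (expand_bits_10(xi) << 2) | (expand_bits_10(yi) << 1) | expand_bits_10(zi);
    }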
+
+ // calculate the index of the first differing bit between two adjacent Morton keys
+ __global__ void compute_key_deltas(const int* __restrict__ keys, int* __restrict__ deltas, int n)
+ {
+     const int index = blockDim.x*blockIdx.x + threadIdx.x;
+
+     if (index < n)
+     {
+         int a = keys[index];
+         int b = keys[index+1];
+
+         int x = a^b;
+
+         // store the raw XOR rather than __clz(x): comparing XOR values is a valid
+         // stand-in for comparing common-prefix lengths, with the sense reversed
+         // (a smaller XOR never corresponds to a shorter common prefix)
+         deltas[index] = x;// __clz(x);
+     }
+ }
+
+ __global__ void build_leaves(const vec3* __restrict__ item_lowers, const vec3* __restrict__ item_uppers, int n, const int* __restrict__ indices, int* __restrict__ range_lefts, int* __restrict__ range_rights, BVHPackedNodeHalf* __restrict__ lowers, BVHPackedNodeHalf* __restrict__ uppers)
+ {
+     const int index = blockDim.x*blockIdx.x + threadIdx.x;
+
+     if (index < n)
+     {
+         const int item = indices[index];
+
+         vec3 lower = item_lowers[item];
+         vec3 upper = item_uppers[item];
+
+         // write leaf nodes
+         lowers[index] = make_node(lower, item, true);
+         uppers[index] = make_node(upper, item, false);
+
+         // write leaf key ranges
+         range_lefts[index] = index;
+         range_rights[index] = index;
+     }
+ }
+
+ // this bottom-up process assigns left and right children and combines bounds to form internal nodes;
+ // one thread is launched per leaf node, each thread calculates its parent node and assigns
+ // itself to either the left or right parent slot, and the last child to complete a parent moves
+ // up the hierarchy
+ __global__ void build_hierarchy(int n, int* root, const int* __restrict__ deltas, int* __restrict__ num_children, const int* __restrict__ primitive_indices, volatile int* __restrict__ range_lefts, volatile int* __restrict__ range_rights, volatile int* __restrict__ parents, volatile BVHPackedNodeHalf* __restrict__ lowers, volatile BVHPackedNodeHalf* __restrict__ uppers)
+ {
+     int index = blockDim.x*blockIdx.x + threadIdx.x;
+
+     if (index < n)
+     {
+         const int internal_offset = n;
+
+         for (;;)
+         {
+             int left = range_lefts[index];
+             int right = range_rights[index];
+
+             // check if we are the root node; if so then store out our index and terminate
+             if (left == 0 && right == n-1)
+             {
+                 *root = index;
+                 parents[index] = -1;
+
+                 break;
+             }
+
+             int childCount = 0;
+
+             int parent;
+
+             bool parent_right = false;
+             if (left == 0)
+             {
+                 parent_right = true;
+             }
+             else if ((right != n - 1 && deltas[right] <= deltas[left - 1]))
+             {
+                 // tie breaking: this avoids always choosing the right node, which can result in a very deep tree.
+                 // generate a pseudo-random binary value to randomly choose left or right groupings;
+                 // since primitives with the same Morton code are not sorted at all, determining the order from primitive_indices alone may also be unreliable,
+                 // so the decision is made by XOR-ing the parities of the primitive indices on either side of the split.
+                 if (deltas[right] == deltas[left - 1])
+                 {
+                     parent_right = (primitive_indices[left - 1] % 2) ^ (primitive_indices[right] % 2);
+                 }
+                 else
+                 {
+                     parent_right = true;
+                 }
+             }
+
+             if (parent_right)
+             {
+                 parent = right + internal_offset;
+
+                 // set parent left child
+                 parents[index] = parent;
+                 lowers[parent].i = index;
+                 range_lefts[parent] = left;
+
+                 // ensure above writes are visible to all threads
+                 __threadfence();
+
+                 childCount = atomicAdd(&num_children[parent], 1);
+             }
+             else
+             {
+                 parent = left + internal_offset - 1;
+
+                 // set parent right child
+                 parents[index] = parent;
+                 uppers[parent].i = index;
+                 range_rights[parent] = right;
+
+                 // ensure above writes are visible to all threads
+                 __threadfence();
+
+                 childCount = atomicAdd(&num_children[parent], 1);
+             }
+
+             // if we are the last thread to arrive (such that the parent node is now complete)
+             // then update its bounds and move on to the next parent in the hierarchy
+             if (childCount == 1)
+             {
+                 const int left_child = lowers[parent].i;
+                 const int right_child = uppers[parent].i;
+
+                 vec3 left_lower = vec3(lowers[left_child].x,
+                                        lowers[left_child].y,
+                                        lowers[left_child].z);
+
+                 vec3 left_upper = vec3(uppers[left_child].x,
+                                        uppers[left_child].y,
+                                        uppers[left_child].z);
+
+                 vec3 right_lower = vec3(lowers[right_child].x,
+                                        lowers[right_child].y,
+                                        lowers[right_child].z);
+
+
+                 vec3 right_upper = vec3(uppers[right_child].x,
+                                        uppers[right_child].y,
+                                        uppers[right_child].z);
+
+                 // union of child bounds
+                 vec3 lower = min(left_lower, right_lower);
+                 vec3 upper = max(left_upper, right_upper);
+
+                 // write new BVH nodes
+                 make_node(lowers+parent, lower, left_child, false);
+                 make_node(uppers+parent, upper, right_child, false);
+
+                 // move on to processing the parent
+                 index = parent;
+             }
+             else
+             {
+                 // parent not ready (we are the first child), terminate thread
+                 break;
+             }
+         }
+     }
+ }
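A small worked example makes the parent-selection rule concrete (this trace is editorial, derived from the kernel above). Take n = 4 leaves with sorted 2-bit keys 00, 01, 10, 11, so the adjacent XOR deltas are [1, 3, 1] (binary 01, 11, 01) and internal nodes occupy indices 4..6 (internal_offset = 4):

    leaf 0: left == 0                        -> parent = 0 + 4     = 4 (left slot)
    leaf 1: deltas[1] = 3 > deltas[0] = 1    -> parent = 1 + 4 - 1 = 4 (right slot)
    leaf 2: deltas[2] = 1 < deltas[1] = 3    -> parent = 2 + 4     = 6 (left slot)
    leaf 3: right == n - 1                   -> parent = 3 + 4 - 1 = 6 (right slot)
    node 4: range [0,1], left == 0           -> parent = 1 + 4     = 5 (left slot)
    node 6: range [2,3], right == n - 1      -> parent = 2 + 4 - 1 = 5 (right slot)
    node 5: range [0,3] covers all keys      -> root

Each node merges toward the neighbor with which it shares the longer common key prefix (the smaller delta), which is exactly the agglomerative clustering rule of the LBVH paper cited above.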
+
+ /*
+  * LBVH uses a bottom-up constructor, which makes variable-sized leaf nodes more challenging to achieve.
+  * Simply splitting the ordered primitives into uniform groups of size BVH_LEAF_SIZE would result in poor
+  * quality. Instead, after the hierarchy is built, we convert any internal node whose range is
+  * <= BVH_LEAF_SIZE into a new leaf node. This is done by the kernel mark_packed_leaf_nodes below.
+  */
+ __global__ void mark_packed_leaf_nodes(int n, const int* __restrict__ range_lefts, const int* __restrict__ range_rights, const int* __restrict__ parents,
+     BVHPackedNodeHalf* __restrict__ lowers, BVHPackedNodeHalf* __restrict__ uppers)
+ {
+     int node_index = blockDim.x * blockIdx.x + threadIdx.x;
+     if (node_index < n)
+     {
+         // mark the node as a leaf if its range is no larger than BVH_LEAF_SIZE, or if it sits deeper than BVH_QUERY_STACK_SIZE;
+         // this forever mutes its child nodes so that they will never be accessed
+
+         // calculate depth
+         int depth = 1;
+         int parent = parents[node_index];
+         while (parent != -1)
+         {
+             parent = parents[parent];
+             depth++;
+         }
+
+         int left = range_lefts[node_index];
+         // the LBVH constructor's range is defined as left <= i <= right;
+         // we need to convert it to our convention: left <= i < right
+         int right = range_rights[node_index] + 1;
+         if (right - left <= BVH_LEAF_SIZE || depth >= BVH_QUERY_STACK_SIZE)
+         {
+             lowers[node_index].b = 1;
+             lowers[node_index].i = left;
+             uppers[node_index].i = right;
+         }
+     }
+ }
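The depth cap exists because BVH queries traverse with a fixed-size local stack rather than recursion, so any node deeper than the stack can hold must be packed into a leaf. A minimal sketch of that traversal shape follows; overlaps() and process_leaf() are hypothetical helpers, and this is not Warp's actual query code:

    // visit every node whose AABB overlaps the query bounds, using an
    // explicit stack of at most BVH_QUERY_STACK_SIZE entries
    int stack[BVH_QUERY_STACK_SIZE];
    int count = 0;
    stack[count++] = *bvh.root;

    while (count)
    {
        const int node = stack[--count];

        if (!overlaps(bvh, node, query_lower, query_upper))
            continue;

        if (bvh.node_lowers[node].b)
        {
            // packed leaf: lowers.i / uppers.i bound a primitive range,
            // resolved through bvh.primitive_indices
            process_leaf(bvh, node);
        }
        else
        {
            // internal node: push both children; a tree capped at
            // BVH_QUERY_STACK_SIZE depth can never overflow the stack
            stack[count++] = bvh.node_lowers[node].i;
            stack[count++] = bvh.node_uppers[node].i;
        }
    }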
+
+
+ CUDA_CALLABLE inline vec3 Vec3Max(const vec3& a, const vec3& b) { return wp::max(a, b); }
+ CUDA_CALLABLE inline vec3 Vec3Min(const vec3& a, const vec3& b) { return wp::min(a, b); }
+
+ __global__ void compute_total_bounds(const vec3* item_lowers, const vec3* item_uppers, vec3* total_lower, vec3* total_upper, int num_items)
+ {
+     typedef cub::BlockReduce<vec3, 256> BlockReduce;
+
+     __shared__ typename BlockReduce::TempStorage temp_storage;
+
+     const int blockStart = blockDim.x*blockIdx.x;
+     const int numValid = ::min(num_items-blockStart, blockDim.x);
+
+     const int tid = blockStart + threadIdx.x;
+
+     if (tid < num_items)
+     {
+         vec3 lower = item_lowers[tid];
+         vec3 upper = item_uppers[tid];
+
+         vec3 block_upper = BlockReduce(temp_storage).Reduce(upper, Vec3Max, numValid);
+
+         // sync threads because the second reduce uses the same temp storage as the first
+         __syncthreads();
+
+         vec3 block_lower = BlockReduce(temp_storage).Reduce(lower, Vec3Min, numValid);
+
+         if (threadIdx.x == 0)
+         {
+             // fold this block's result into the global bounds
+             atomic_max(total_upper, block_upper);
+             atomic_min(total_lower, block_lower);
+         }
+     }
+ }
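The per-block results are folded into the global bounds with atomic_min/atomic_max on vec3, which Warp supplies as built-ins. CUDA itself has no native atomic max for float, so a component-wise helper typically falls back to a compare-and-swap loop on the bit pattern; a common formulation (a sketch, not necessarily how Warp implements it) is:

    // atomically set *address = max(*address, val) for float, via CAS on the bits
    __device__ float atomic_max_float(float* address, float val)
    {
        int* address_as_int = (int*)address;
        int old = *address_as_int;
        int assumed;

        do
        {
            assumed = old;
            old = atomicCAS(address_as_int, assumed,
                            __float_as_int(fmaxf(val, __int_as_float(assumed))));
        } while (assumed != old);

        return __int_as_float(old);
    }

The loop retries only while some other thread raced in between the read and the swap, so it terminates quickly in practice, and the two-level scheme (block reduce first, one atomic per block) keeps contention low.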
+
+ // compute inverse edge lengths; this is done on the GPU just to avoid a CPU->GPU sync point
+ __global__ void compute_total_inv_edges(const vec3* total_lower, const vec3* total_upper, vec3* total_inv_edges)
+ {
+     vec3 edges = (total_upper[0]-total_lower[0]);
+     edges += vec3(0.0001f);
+
+     total_inv_edges[0] = vec3(1.0f/edges[0], 1.0f/edges[1], 1.0f/edges[2]);
+ }
+
+
+
+ LinearBVHBuilderGPU::LinearBVHBuilderGPU()
+     : indices(NULL)
+     , keys(NULL)
+     , deltas(NULL)
+     , range_lefts(NULL)
+     , range_rights(NULL)
+     , num_children(NULL)
+     , total_lower(NULL)
+     , total_upper(NULL)
+     , total_inv_edges(NULL)
+ {
+     total_lower = (vec3*)alloc_device(WP_CURRENT_CONTEXT, sizeof(vec3));
+     total_upper = (vec3*)alloc_device(WP_CURRENT_CONTEXT, sizeof(vec3));
+     total_inv_edges = (vec3*)alloc_device(WP_CURRENT_CONTEXT, sizeof(vec3));
+ }
+
+ LinearBVHBuilderGPU::~LinearBVHBuilderGPU()
+ {
+     free_device(WP_CURRENT_CONTEXT, total_lower);
+     free_device(WP_CURRENT_CONTEXT, total_upper);
+     free_device(WP_CURRENT_CONTEXT, total_inv_edges);
+ }
+
+
+
+ void LinearBVHBuilderGPU::build(BVH& bvh, const vec3* item_lowers, const vec3* item_uppers, int num_items, bounds3* total_bounds)
+ {
+     // allocate temporary memory used during building
+     indices = (int*)alloc_device(WP_CURRENT_CONTEXT, sizeof(int)*num_items*2);    // *2 for radix sort
+     keys = (int*)alloc_device(WP_CURRENT_CONTEXT, sizeof(int)*num_items*2);       // *2 for radix sort
+     deltas = (int*)alloc_device(WP_CURRENT_CONTEXT, sizeof(int)*num_items);       // highest differing bit between the keys of items i and i+1
+     range_lefts = (int*)alloc_device(WP_CURRENT_CONTEXT, sizeof(int)*bvh.max_nodes);
+     range_rights = (int*)alloc_device(WP_CURRENT_CONTEXT, sizeof(int)*bvh.max_nodes);
+     num_children = (int*)alloc_device(WP_CURRENT_CONTEXT, sizeof(int)*bvh.max_nodes);
+
+     // if total bounds were supplied by the host then we just
+     // compute our edge lengths and upload them to the GPU directly
+     if (total_bounds)
+     {
+         // compute the inverse edge lengths used for Morton code quantization
+         vec3 edges = (*total_bounds).edges();
+         edges += vec3(0.0001f);
+
+         vec3 inv_edges = vec3(1.0f/edges[0], 1.0f/edges[1], 1.0f/edges[2]);
+
+         memcpy_h2d(WP_CURRENT_CONTEXT, total_lower, &total_bounds->lower[0], sizeof(vec3));
+         memcpy_h2d(WP_CURRENT_CONTEXT, total_upper, &total_bounds->upper[0], sizeof(vec3));
+         memcpy_h2d(WP_CURRENT_CONTEXT, total_inv_edges, &inv_edges[0], sizeof(vec3));
+     }
+     else
+     {
+         static vec3 upper(-FLT_MAX);
+         static vec3 lower(FLT_MAX);
+
+         memcpy_h2d(WP_CURRENT_CONTEXT, total_lower, &lower, sizeof(lower));
+         memcpy_h2d(WP_CURRENT_CONTEXT, total_upper, &upper, sizeof(upper));
+
+         // compute the total bounds on the GPU
+         wp_launch_device(WP_CURRENT_CONTEXT, compute_total_bounds, num_items, (item_lowers, item_uppers, total_lower, total_upper, num_items));
+
+         // compute the inverse edge lengths
+         wp_launch_device(WP_CURRENT_CONTEXT, compute_total_inv_edges, 1, (total_lower, total_upper, total_inv_edges));
+     }
+
+     // assign a 30-bit Morton code based on the centroid of each item's bounds
+     wp_launch_device(WP_CURRENT_CONTEXT, compute_morton_codes, num_items, (item_lowers, item_uppers, num_items, total_lower, total_inv_edges, indices, keys));
+
+     // sort items based on the Morton key (the 32-bit sort key holds the 3x10-bit codes implied by the morton3 template parameter)
+     radix_sort_pairs_device(WP_CURRENT_CONTEXT, keys, indices, num_items);
+     memcpy_d2d(WP_CURRENT_CONTEXT, bvh.primitive_indices, indices, sizeof(int) * num_items);
+
+     // calculate deltas between adjacent keys
+     wp_launch_device(WP_CURRENT_CONTEXT, compute_key_deltas, num_items, (keys, deltas, num_items-1));
+
+     // initialize leaf nodes
+     wp_launch_device(WP_CURRENT_CONTEXT, build_leaves, num_items, (item_lowers, item_uppers, num_items, indices, range_lefts, range_rights, bvh.node_lowers, bvh.node_uppers));
+
+     // reset child counts; this is the atomic counter that tells us when an internal node is complete, only used during building
+     memset_device(WP_CURRENT_CONTEXT, num_children, 0, sizeof(int)*bvh.max_nodes);
+
+     // build the tree and internal node bounds
+     wp_launch_device(WP_CURRENT_CONTEXT, build_hierarchy, num_items, (num_items, bvh.root, deltas, num_children, bvh.primitive_indices, range_lefts, range_rights, bvh.node_parents, bvh.node_lowers, bvh.node_uppers));
+     wp_launch_device(WP_CURRENT_CONTEXT, mark_packed_leaf_nodes, bvh.max_nodes, (bvh.max_nodes, range_lefts, range_rights, bvh.node_parents, bvh.node_lowers, bvh.node_uppers));
+
+     // free temporary memory
+     free_device(WP_CURRENT_CONTEXT, indices);
+     free_device(WP_CURRENT_CONTEXT, keys);
+     free_device(WP_CURRENT_CONTEXT, deltas);
+
+     free_device(WP_CURRENT_CONTEXT, range_lefts);
+     free_device(WP_CURRENT_CONTEXT, range_rights);
+     free_device(WP_CURRENT_CONTEXT, num_children);
+
+ }
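The *2 allocations for keys and indices exist because GPU radix sort ping-pongs between an input and an output buffer; radix_sort_pairs_device presumably uses the second half of each allocation as that scratch space. With raw CUB the same idea looks roughly like this (a sketch under that assumption, not Warp's actual implementation in sort.cu):

    #include <cub/cub.cuh>

    // sort num_items (key, value) pairs, given buffers of size 2*num_items
    // whose upper halves serve as the double-buffer scratch space
    void radix_sort_pairs_sketch(int* keys, int* values, int num_items)
    {
        cub::DoubleBuffer<int> d_keys(keys, keys + num_items);
        cub::DoubleBuffer<int> d_values(values, values + num_items);

        // first call only queries the required temporary storage size
        void* d_temp = NULL;
        size_t temp_bytes = 0;
        cub::DeviceRadixSort::SortPairs(d_temp, temp_bytes, d_keys, d_values, num_items);

        cudaMalloc(&d_temp, temp_bytes);
        cub::DeviceRadixSort::SortPairs(d_temp, temp_bytes, d_keys, d_values, num_items);
        cudaFree(d_temp);

        // results may have landed in the alternate buffer; copy back if needed
        if (d_keys.Current() != keys)
            cudaMemcpy(keys, d_keys.Current(), num_items * sizeof(int), cudaMemcpyDeviceToDevice);
        if (d_values.Current() != values)
            cudaMemcpy(values, d_values.Current(), num_items * sizeof(int), cudaMemcpyDeviceToDevice);
    }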
+
+ // buffer_size is the number of elements of type T, not the number of bytes
+ template<typename T>
+ T* make_device_buffer_of(void* context, T* host_buffer, size_t buffer_size)
+ {
+     T* device_buffer = (T*)alloc_device(context, sizeof(T) * buffer_size);
+     memcpy_h2d(context, device_buffer, host_buffer, sizeof(T) * buffer_size);
+
+     return device_buffer;
+ }
+
+ void copy_host_tree_to_device(void* context, BVH& bvh_host, BVH& bvh_device_on_host)
+ {
+ #ifdef REORDER_HOST_TREE
+
+
+     // reorder bvh_host so that its leaf nodes sit at the front of the node arrays;
+     // this is essential for the device refit, which launches one thread per leaf over the first num_leaf_nodes slots
+     BVHPackedNodeHalf* node_lowers_reordered = new BVHPackedNodeHalf[bvh_host.max_nodes];
+     BVHPackedNodeHalf* node_uppers_reordered = new BVHPackedNodeHalf[bvh_host.max_nodes];
+
+     int* node_parents_reordered = new int[bvh_host.max_nodes];
+
+     std::vector<int> old_to_new(bvh_host.max_nodes, -1);
+
+     // We place nodes in two passes:
+     // Pass 1: leaf nodes at the front
+     // Pass 2: non-leaf nodes, fixing up the stored root index along the way
+     int next_pos = 0;
+
+     const int root_index = *bvh_host.root;
+     // Pass 1: place leaf nodes at the front
+     for (int i = 0; i < bvh_host.num_nodes; ++i)
+     {
+         if (bvh_host.node_lowers[i].b)
+         {
+             node_lowers_reordered[next_pos] = bvh_host.node_lowers[i];
+             node_uppers_reordered[next_pos] = bvh_host.node_uppers[i];
+             old_to_new[i] = next_pos;
+             next_pos++;
+         }
+     }
+
+     // Pass 2: place non-leaf nodes
+     for (int i = 0; i < bvh_host.num_nodes; ++i)
+     {
+         if (i == root_index)
+         {
+             if (bvh_host.node_lowers[i].b)
+             // if the root node is a leaf node, there must only be one node
+             {
+                 *bvh_host.root = 0;
+             }
+             else
+             {
+                 *bvh_host.root = next_pos;
+             }
+         }
+         if (!bvh_host.node_lowers[i].b)
+         {
+             node_lowers_reordered[next_pos] = bvh_host.node_lowers[i];
+             node_uppers_reordered[next_pos] = bvh_host.node_uppers[i];
+             old_to_new[i] = next_pos;
+             next_pos++;
+         }
+     }
+
+     // remap parent and child indices by enumerating all old->new pairs
+     for (int old_index = 0; old_index < bvh_host.num_nodes; ++old_index) {
+         int new_index = old_to_new[old_index];
+
+         int old_parent = bvh_host.node_parents[old_index];
+         if (old_parent != -1)
+         {
+             node_parents_reordered[new_index] = old_to_new[old_parent];
+         }
+         else
+         {
+             node_parents_reordered[new_index] = -1;
+         }
+
+         // only need to remap the child indices of non-leaf nodes
+         if (!bvh_host.node_lowers[old_index].b)
+         {
+             node_lowers_reordered[new_index].i = old_to_new[bvh_host.node_lowers[old_index].i];
+             node_uppers_reordered[new_index].i = old_to_new[bvh_host.node_uppers[old_index].i];
+         }
+     }
+
+     delete[] bvh_host.node_lowers;
+     delete[] bvh_host.node_uppers;
+     delete[] bvh_host.node_parents;
+
+     bvh_host.node_lowers = node_lowers_reordered;
+     bvh_host.node_uppers = node_uppers_reordered;
+     bvh_host.node_parents = node_parents_reordered;
+ #endif // REORDER_HOST_TREE
+
+     bvh_device_on_host.num_nodes = bvh_host.num_nodes;
+     bvh_device_on_host.num_leaf_nodes = bvh_host.num_leaf_nodes;
+     bvh_device_on_host.max_nodes = bvh_host.max_nodes;
+     bvh_device_on_host.num_items = bvh_host.num_items;
+     bvh_device_on_host.max_depth = bvh_host.max_depth;
+
+     bvh_device_on_host.root = (int*)alloc_device(context, sizeof(int));
+     memcpy_h2d(context, bvh_device_on_host.root, bvh_host.root, sizeof(int));
+     bvh_device_on_host.context = context;
+
+     bvh_device_on_host.node_lowers = make_device_buffer_of(context, bvh_host.node_lowers, bvh_host.max_nodes);
+     bvh_device_on_host.node_uppers = make_device_buffer_of(context, bvh_host.node_uppers, bvh_host.max_nodes);
+     bvh_device_on_host.node_parents = make_device_buffer_of(context, bvh_host.node_parents, bvh_host.max_nodes);
+     bvh_device_on_host.primitive_indices = make_device_buffer_of(context, bvh_host.primitive_indices, bvh_host.num_items);
+ }
+
+ // create in-place given an existing descriptor
+ void bvh_create_device(void* context, vec3* lowers, vec3* uppers, int num_items, int constructor_type, BVH& bvh_device_on_host)
+ {
+     ContextGuard guard(context);
+     if (constructor_type == BVH_CONSTRUCTOR_SAH || constructor_type == BVH_CONSTRUCTOR_MEDIAN)
+     // CPU-based constructors
+     {
+         // copy bounds back to the CPU
+         std::vector<vec3> lowers_host(num_items);
+         std::vector<vec3> uppers_host(num_items);
+         memcpy_d2h(WP_CURRENT_CONTEXT, lowers_host.data(), lowers, sizeof(vec3) * num_items);
+         memcpy_d2h(WP_CURRENT_CONTEXT, uppers_host.data(), uppers, sizeof(vec3) * num_items);
+
+         // run the CPU-based constructor
+         wp::BVH bvh_host;
+         bvh_create_host(lowers_host.data(), uppers_host.data(), num_items, constructor_type, bvh_host);
+
+         // copy the host tree to the device
+         wp::copy_host_tree_to_device(WP_CURRENT_CONTEXT, bvh_host, bvh_device_on_host);
+         // replace host bounds with device bounds
+         bvh_device_on_host.item_lowers = lowers;
+         bvh_device_on_host.item_uppers = uppers;
+         // node_counts is not allocated for the host tree
+         bvh_device_on_host.node_counts = (int*)alloc_device(WP_CURRENT_CONTEXT, sizeof(int) * bvh_device_on_host.max_nodes);
+         bvh_destroy_host(bvh_host);
+     }
+     else if (constructor_type == BVH_CONSTRUCTOR_LBVH)
+     {
+         bvh_device_on_host.num_items = num_items;
+         bvh_device_on_host.max_nodes = 2 * num_items - 1;
+         bvh_device_on_host.num_leaf_nodes = num_items;
+         bvh_device_on_host.node_lowers = (BVHPackedNodeHalf*)alloc_device(WP_CURRENT_CONTEXT, sizeof(BVHPackedNodeHalf) * bvh_device_on_host.max_nodes);
+         memset_device(WP_CURRENT_CONTEXT, bvh_device_on_host.node_lowers, 0, sizeof(BVHPackedNodeHalf) * bvh_device_on_host.max_nodes);
+         bvh_device_on_host.node_uppers = (BVHPackedNodeHalf*)alloc_device(WP_CURRENT_CONTEXT, sizeof(BVHPackedNodeHalf) * bvh_device_on_host.max_nodes);
+         memset_device(WP_CURRENT_CONTEXT, bvh_device_on_host.node_uppers, 0, sizeof(BVHPackedNodeHalf) * bvh_device_on_host.max_nodes);
+         bvh_device_on_host.node_parents = (int*)alloc_device(WP_CURRENT_CONTEXT, sizeof(int) * bvh_device_on_host.max_nodes);
+         bvh_device_on_host.node_counts = (int*)alloc_device(WP_CURRENT_CONTEXT, sizeof(int) * bvh_device_on_host.max_nodes);
+         bvh_device_on_host.root = (int*)alloc_device(WP_CURRENT_CONTEXT, sizeof(int));
+         bvh_device_on_host.primitive_indices = (int*)alloc_device(WP_CURRENT_CONTEXT, sizeof(int) * num_items);
+         bvh_device_on_host.item_lowers = lowers;
+         bvh_device_on_host.item_uppers = uppers;
+
+         bvh_device_on_host.context = context ? context : cuda_context_get_current();
+
+         LinearBVHBuilderGPU builder;
+         builder.build(bvh_device_on_host, lowers, uppers, num_items, NULL);
+     }
+     else
+     {
+         printf("Unrecognized constructor type: %d! For the GPU constructor it should be SAH (0), Median (1), or LBVH (2)!\n", constructor_type);
+     }
+ }
+
+ void bvh_destroy_device(BVH& bvh)
+ {
+     ContextGuard guard(bvh.context);
+
+     free_device(WP_CURRENT_CONTEXT, bvh.node_lowers); bvh.node_lowers = NULL;
+     free_device(WP_CURRENT_CONTEXT, bvh.node_uppers); bvh.node_uppers = NULL;
+     free_device(WP_CURRENT_CONTEXT, bvh.node_parents); bvh.node_parents = NULL;
+     free_device(WP_CURRENT_CONTEXT, bvh.node_counts); bvh.node_counts = NULL;
+     free_device(WP_CURRENT_CONTEXT, bvh.primitive_indices); bvh.primitive_indices = NULL;
+     free_device(WP_CURRENT_CONTEXT, bvh.root); bvh.root = NULL;
+ }
+
+
+ } // namespace wp
+
+
+ void bvh_refit_device(uint64_t id)
+ {
+     wp::BVH bvh;
+     if (bvh_get_descriptor(id, bvh))
+     {
+         ContextGuard guard(bvh.context);
+
+         bvh_refit_device(bvh);
+     }
+ }
+
+ /*
+  * Refitting a packed LBVH: since we do not know the number of true leaf nodes, let alone where
+  * they are, we launch num_items threads, i.e. one per leaf node of the original tree. The
+  * refitting threads start from the nodes corresponding to the original leaf nodes, which might be
+  * muted. However, the muted leaf nodes still hold pointers to their parents, so the up-tracing
+  * still works. We only compute the bounding box of a leaf node if its parent is not a leaf node.
+  */
+ uint64_t bvh_create_device(void* context, wp::vec3* lowers, wp::vec3* uppers, int num_items, int constructor_type)
+ {
+     ContextGuard guard(context);
+     wp::BVH bvh_device_on_host;
+     wp::BVH* bvh_device_ptr = nullptr;
+
+     bvh_create_device(WP_CURRENT_CONTEXT, lowers, uppers, num_items, constructor_type, bvh_device_on_host);
+
+     // create device-side BVH descriptor
+     bvh_device_ptr = (wp::BVH*)alloc_device(WP_CURRENT_CONTEXT, sizeof(wp::BVH));
+     memcpy_h2d(WP_CURRENT_CONTEXT, bvh_device_ptr, &bvh_device_on_host, sizeof(wp::BVH));
+
+     uint64_t bvh_id = (uint64_t)bvh_device_ptr;
+     wp::bvh_add_descriptor(bvh_id, bvh_device_on_host);
+     return bvh_id;
+ }
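The returned id is simply the device address of the BVH struct, while bvh_add_descriptor / bvh_get_descriptor keep a host-side mirror keyed by that id so later calls can refit and free the tree without reading back from the device. A plausible shape for that registry follows (a sketch; the real implementation lives elsewhere in Warp's native sources and may differ):

    #include <cstdint>
    #include <map>

    // host-side registry mapping an opaque id (the device pointer)
    // to the host mirror of the descriptor; illustrative only
    static std::map<uint64_t, wp::BVH> g_bvh_descriptors;

    namespace wp
    {

    void bvh_add_descriptor(uint64_t id, const BVH& bvh)
    {
        g_bvh_descriptors[id] = bvh;
    }

    bool bvh_get_descriptor(uint64_t id, BVH& bvh)
    {
        const auto it = g_bvh_descriptors.find(id);
        if (it == g_bvh_descriptors.end())
            return false;
        bvh = it->second;
        return true;
    }

    void bvh_rem_descriptor(uint64_t id)
    {
        g_bvh_descriptors.erase(id);
    }

    } // namespace wp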
+
+
+ void bvh_destroy_device(uint64_t id)
+ {
+     wp::BVH bvh;
+     if (wp::bvh_get_descriptor(id, bvh))
+     {
+         wp::bvh_destroy_device(bvh);
+         wp::bvh_rem_descriptor(id);
+
+         // free descriptor
+         free_device(WP_CURRENT_CONTEXT, (void*)id);
+     }
+ }
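Taken together, the exported functions above form the full lifecycle of the C API. An illustrative host-side sequence, assuming d_lowers and d_uppers are device allocations the caller updates in place between frames:

    // build once, refit whenever the bounds move, destroy at shutdown
    uint64_t id = bvh_create_device(context, d_lowers, d_uppers, num_items, BVH_CONSTRUCTOR_LBVH);

    // ... update d_lowers / d_uppers in place on the device ...
    bvh_refit_device(id);    // recomputes node bounds without rebuilding the topology

    bvh_destroy_device(id);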